Merge pull request #39 from zavolanlab/aleksei/strand_tags_and_isoforms

magmir71 · web-flow · commit fcc099a02b38 · 2026-04-22T09:07:22.000+02:00
implemented UMI-based deduplication, appending of polyA tails, accounting for strandedness
diff --git a/README.md b/README.md
@@ -6,15 +6,19 @@ It is tailored for GPU-accelerated basecalling, producing quality control plots,
 
 The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Conda environments to automatically manage and isolate software dependencies.
 
+The pipeline takes advantage of several functions implemented in the [zavolab_pyutils](https://github.com/zavolanlab/zavolab_pyutils/tree/dev) and [SCINPAS](https://github.com/zavolanlab/SCINPAS/tree/making_CLI_interface_for_python_functions) packages.
+
 ## Pipeline Summary
 1. **GPU Basecalling (`dorado`)**: Performs basecalling and poly-A tail estimation natively on NVIDIA GPUs.
-2. **Alignment & Merging (`minimap2`)**: Maps reads to the reference genome using minimap2 and merges BAMs (e.g. technical replicates from samples).
-3. **De novo transcriptome assembly**: Enriches input transcriptome annotations based on observed read alignments
-4. **Isoform Analysis (`custom python`)**: Assigns alignments to specific transcript isoforms.
-5. **Annotating raw current data from a subsample of reads**: 
-5.1 **Signal Extraction (`pod5`)**: Subsamples reads of interest and extracts corresponding raw signal chunks from `.pod5` files.
-5.2 **Move-table Emission (`dorado`)**: Re-processes subsetted reads to emit basecaller move-tables. 
-5.3 **Dataframe Generation & QC Visualization (`seaborn`)**: Synchronizes sequence strings with raw signal variations and renders PDF plots for individual reads.
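+2. **Reverse-complementing reads from the reverse strand (optional)**: In the cDNA protocol molecules can be sequenced in either orientation, so reads that the basecaller tags as reverse-strand are reverse-complemented before alignment.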
+3. **Alignment & Merging (`minimap2`)**: Maps reads to the reference genome using minimap2 and merges BAMs (e.g. technical replicates from samples).
+4. **UMI deduplication (`umi_tools`, `custom python`)**: Normalizes UMI lengths, performs UMI- and alignment-based deduplication, then corrects NH tag and MAPQ values.
+5. **De novo transcriptome assembly (`custom python`)**: Enriches input transcriptome annotations based on observed read alignments.
+6. **Isoform Analysis (`custom python`)**: Assigns alignments to specific transcript isoforms.
+7. **QC: annotating raw current data from a subsample of reads**: 
+7.1 **Signal Extraction (`pod5`)**: Subsamples reads of interest and extracts corresponding raw signal chunks from `.pod5` files.
+7.2 **Move-table Emission (`dorado`)**: Re-processes subsetted reads to emit basecaller move-tables. 
+7.3 **Dataframe Generation & QC Visualization (`seaborn`)**: Synchronizes sequence strings with raw signal variations and renders PDF plots for individual reads.
 
 ## Quick Start
 
diff --git a/bin/append_polyA_tail.py b/bin/append_polyA_tail.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+
+import warnings
+warnings.simplefilter('ignore')
+
+import sys
+from argparse import ArgumentParser, RawTextHelpFormatter
+import os
+import numpy as np
+import pysam
+import csv
+
+def get_rc(seq):
+    """Return the reverse complement of a sequence."""
+    trans = str.maketrans("ACGTUacgtuNn", "TGCAAtgcaaNn")
+    return seq.translate(trans)[::-1]
+
+def main():
+    parser = ArgumentParser(description="Append polyA tails based on pt and fixed cleavage site tags.")
+    parser.add_argument("--input_bam", required=True)
+    parser.add_argument("--output_appended_bam", required=True)
+    parser.add_argument("--output_skipped_bam", required=True)
+    parser.add_argument("--stats_tsv", required=True)
+    parser.add_argument("--sample_id", required=True)
+    parser.add_argument("--tag_orig_cs", default="XO")
+    parser.add_argument("--tag_fixed_cs", default="XF")
+    
+    args = parser.parse_args()
+    
+    almnt_file = pysam.AlignmentFile(args.input_bam, "rb")
+    bam_appended = pysam.AlignmentFile(args.output_appended_bam, "wb", header=almnt_file.header)
+    bam_skipped = pysam.AlignmentFile(args.output_skipped_bam, "wb", header=almnt_file.header)
+    
+    reads_total = 0
+    reads_appended = 0
+    reads_skipped_no_pt = 0
+    reads_skipped_pt_outside_valid_range = 0
+    reads_skipped_error = 0
+    
+    for almnt in almnt_file:
+        reads_total += 1
+        
+        # 1. Check for valid pt tag
+        try:
+            pt = almnt.get_tag('pt')
+        except KeyError:
+            pt = -1
+            reads_skipped_no_pt += 1
+            bam_skipped.write(almnt)
+            continue
+            
+        if pt <= 0:
+            bam_skipped.write(almnt)
+            reads_skipped_pt_outside_valid_range += 1
+            continue
+            
+        # 2. Get Cleavage Site Shift Difference
+        try:
+            OCS = almnt.get_tag(args.tag_orig_cs)
+            FCS = almnt.get_tag(args.tag_fixed_cs)
+            # The absolute difference is exactly how many bases were "rescued" from the softclip
+            difference = abs(OCS - FCS)
+        except KeyError:
+            bam_skipped.write(almnt)
+            reads_skipped_error += 1
+            continue
+
+        # 3. Orient to transcript 5' -> 3'
+        if almnt.is_forward:
+            read_seq = almnt.query_sequence
+            read_qualstr = almnt.query_qualities
+            cigar = list(almnt.cigar)
+        else:
+            read_seq = almnt.get_forward_sequence()
+            read_qualstr = almnt.query_qualities[::-1] if almnt.query_qualities else None
+            cigar = list(almnt.cigar)[::-1]
+            
+        # 4. Calculate exact truncation
+        old_clip_len = cigar[-1][1] if cigar[-1][0] == 4 else 0
+        trim_len = old_clip_len - difference
+        
+        if trim_len < 0:
+            bam_skipped.write(almnt)
+            reads_skipped_error += 1
+            continue
+
+        # 5. Modify Sequence
+        new_read_seq = read_seq[:(-trim_len if trim_len > 0 else None)] + "A" * pt
+
+        # 6. Modify Quality String
+        if read_qualstr is not None:
+            if trim_len > 0:
+                trimmed_quals = read_qualstr[-trim_len:]
+            else:
+                trimmed_quals = read_qualstr[-(min(5, len(read_seq))):] if len(read_seq) > 0 else [30]
+            
+            quality_val = int(np.round(np.mean(trimmed_quals), 0)) if len(trimmed_quals) > 0 else 30
+                
+            new_read_qualstr = list(read_qualstr[:(-trim_len if trim_len > 0 else None)])
+            new_read_qualstr.extend([quality_val] * pt)
+        else:
+            new_read_qualstr = None
+
+        # 7. Modify CIGAR
+        new_cigar = [[op, length] for op, length in cigar]
+            
+        if new_cigar[-1][0] == 4:
+            new_cigar.pop() # Remove old 3' soft-clip
+            
+        if difference > 0:
+            for idx in range(len(new_cigar)-1, -1, -1):
+                if new_cigar[idx][0] == 0: # Add rescued bases to the last Match (M) block
+                    new_cigar[idx][1] += difference
+                    break
+                    
+        if pt > 0:
+            new_cigar.append([4, pt]) # Append new soft-clip of length pt
+            
+        # Validation Check
+        cigar_query_len = sum(length for op, length in new_cigar if op in [0, 1, 4, 7, 8])
+        if len(new_read_seq) != cigar_query_len or (new_read_qualstr and len(new_read_seq) != len(new_read_qualstr)):
+            bam_skipped.write(almnt)
+            reads_skipped_error += 1
+            continue
+            
+        # 8. Convert back to original BAM orientation
+        if almnt.is_forward:
+            almnt.query_sequence = new_read_seq
+            almnt.query_qualities = new_read_qualstr
+            almnt.cigar = new_cigar
+        else:
+            almnt.query_sequence = get_rc(new_read_seq)
+            if new_read_qualstr is not None:
+                almnt.query_qualities = new_read_qualstr[::-1]
+            almnt.cigar = new_cigar[::-1]
+            
+        almnt.set_tag('pa', 1, 'i')
+        bam_appended.write(almnt)
+        reads_appended += 1
+        
+    almnt_file.close()
+    bam_appended.close()
+    bam_skipped.close()
+    
+    # 9. Write TSV Stats
+    with open(args.stats_tsv, 'w', newline='') as tsv_file:
+        writer = csv.writer(tsv_file, delimiter='\t')
+        writer.writerow(["sample_id", "chunk_filename", "total_alignments", "appended_alignments", "skipped_no_pt", "skipped_error", "skipped_pt_outside_valid_range"])
+        chunk_name = os.path.basename(args.input_bam)
+        writer.writerow([args.sample_id, chunk_name, reads_total, reads_appended, reads_skipped_no_pt, reads_skipped_error, reads_skipped_pt_outside_valid_range])
+
+if __name__ == '__main__':
+    try:
+        main()
+    except KeyboardInterrupt:
+        sys.stderr.write("User interrupt!")
+        sys.exit(1)
diff --git a/bin/extract_cs_bigwig.py b/bin/extract_cs_bigwig.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+import pysam
+import argparse
+import sys
+
+def main():
+    parser = argparse.ArgumentParser(description="Extract 1-based cleavage sites from a BAM custom tag and convert to 0-based BED format.")
+    parser.add_argument("--bam_in", required=True, help="Input BAM file with uniquely mapped reads")
+    parser.add_argument("--bed_out", required=True, help="Output BED file")
+    parser.add_argument("--tag", required=True, help="The SAM tag storing the 1-based cleavage site (e.g., XF or XO)")
+    parser.add_argument("--MAPQ_min", type=int, required=False, default=255, help="minimal MAPQ for an alignment to be included")
+    args = parser.parse_args()
+
+    with pysam.AlignmentFile(args.bam_in, "rb") as bam, open(args.bed_out, "w") as f:
+        for r in bam:
+            # MAPQ 255 represents unique alignments in your pipeline
+            if r.mapping_quality >= args.MAPQ_min and not r.is_unmapped:
+                try:
+                    cs_1based = r.get_tag(args.tag)
+                    # Convert 1-based tag coordinate to 0-based BED interval (length = 1)
+                    start = cs_1based - 1
+                    end = cs_1based
+                    strand = "-" if r.is_reverse else "+"
+                    
+                    # Write in BED6 format: chrom, start, end, name, score, strand
+                    f.write(f"{r.reference_name}\t{start}\t{end}\t{r.query_name}\t0\t{strand}\n")
+                except KeyError:
+                    # Skip reads that are missing the designated cleavage site tag
+                    pass
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        sys.stderr.write("User interrupt!\n")
+        sys.exit(1)
diff --git a/bin/orient_reads.py b/bin/orient_reads.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+
+import pysam
+import argparse
+import csv
+import sys
+import os
+
+def get_rc(seq):
+    """Return the reverse complement of a sequence."""
+    trans = str.maketrans("ACGTUacgtuNn", "TGCAAtgcaaNn")
+    return seq.translate(trans)[::-1]
+
+def main():
+    parser = argparse.ArgumentParser(description="Orient unmapped BAM reads to forward strand based on basecaller tag.")
+    parser.add_argument("--input_bam", required=True, help="Input unmapped BAM file")
+    parser.add_argument("--output_bam", required=True, help="Output oriented BAM file")
+    parser.add_argument("--stats_tsv", required=True, help="Output TSV with tag statistics")
+    parser.add_argument("--sample_id", required=True, help="Sample ID for stats tracking")
+    
+    # NEW ARGUMENTS FOR DYNAMIC TAGGING
+    parser.add_argument("--tag_basecaller_ts", required=True, help="Tag storing the basecaller strand (e.g., TS)")
+    parser.add_argument("--tag_original_ts", required=True, help="Custom tag to store the original basecaller strand (e.g., ZS)")
+    args = parser.parse_args()
+
+    reads_with_ts = 0
+    reads_without_ts = 0
+    reads_reverse_complemented = 0
+
+    # check_sq=False is critical because unmapped BAMs lack reference contig headers
+    with pysam.AlignmentFile(args.input_bam, "rb", check_sq=False) as bam_in, \
+         pysam.AlignmentFile(args.output_bam, "wb", template=bam_in) as bam_out:
+        
+        for read in bam_in:
+            try:
+                # 1. Fetch the original basecaller tag
+                ts = read.get_tag(args.tag_basecaller_ts)
+                reads_with_ts += 1
+                
+                # 2. Store it as a custom tag for traceability
+                read.set_tag(args.tag_original_ts, ts, value_type="A")
+                
+                # 3. Safely delete the original tag using pysam's in-place deletion
+                # This bypasses the set_tags() bug with 'B' type binary arrays!
+                read.set_tag(args.tag_basecaller_ts, None)
+                
+                # 4. Flip the sequence and quality scores if Dorado marked it as negative
+                if ts == "-":
+                    # Reverse complement sequence
+                    orig_seq = read.query_sequence
+                    read.query_sequence = get_rc(orig_seq)
+                    
+                    # Reverse quality scores
+                    orig_qual = read.query_qualities
+                    if orig_qual is not None:
+                        read.query_qualities = orig_qual[::-1]
+                    
+                    reads_reverse_complemented += 1
+                    
+            except KeyError:
+                reads_without_ts += 1
+                
+            bam_out.write(read)
+
+    # Write the chunk statistics to a TSV
+    with open(args.stats_tsv, 'w', newline='') as tsv_file:
+        writer = csv.writer(tsv_file, delimiter='\t')
+        # Dynamically set header based on the provided tag name
+        writer.writerow(["sample_id", "chunk_filename", f"reads_with_{args.tag_basecaller_ts}", f"reads_without_{args.tag_basecaller_ts}", "reads_reverse_complemented"])
+        chunk_name = os.path.basename(args.input_bam)
+        writer.writerow([args.sample_id, chunk_name, reads_with_ts, reads_without_ts, reads_reverse_complemented])
+
+if __name__ == "__main__":
+    main()
diff --git a/envs/processing_bam_with_python.yml b/envs/processing_bam_with_python.yml
@@ -0,0 +1,18 @@
+name: nf-bam_processing_with_python
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - python=3.10
+  - bioconda::bedtools # non-python dependency for zavolab_pyutils
+  - bioconda::samtools # non-python dependency, always needed for BAM processing
+  - conda-forge::pigz
+  - bioconda::ucsc-wigtobigwig
+  - bioconda::ucsc-bedgraphtobigwig
+  - pip
+  - pip:
+      # This tells pip to install directly from the branches of the GitHub repos
+      - git+https://github.com/zavolanlab/zavolab_pyutils.git@dev
+      - git+https://github.com/zavolanlab/SCINPAS.git@making_CLI_interface_for_python_functions
+
+# Cache Bust: Added April 21 14:39 to force pip pulling the latest version of zavolab_pyutils
diff --git a/envs/umi_tools.yml b/envs/umi_tools.yml
@@ -0,0 +1,8 @@
+name: nf-umi_tools
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - python=3.10
+  - bioconda::umi_tools
+  - bioconda::samtools
diff --git a/main.nf b/main.nf
diff --git a/nextflow.config b/nextflow.config