add polyA-tail annotation to Dorado raw signal QC plots

Krish Agarwal · Krish Agarwal · commit 8380ac685163 · 2026-02-18T14:44:09.000+01:00
- Parse pa:B:i tag from Dorado BAM output.
- Map polyA-tail positions onto raw signal timeline.
- Ensure visualization distinguishes polyA region from basecalled region.
diff --git a/bin/plot_signal.py b/bin/plot_signal.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+"""Plot one annotated nanopore signal trace from a pod5_to_df.py CSV."""
+import matplotlib
+matplotlib.use('Agg') # Force non-interactive backend for HPC
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+import argparse
+import os
+
+
+def plot_masked_line(ax, frame, mask, **line_kwargs):
+    """Draw the samples selected by *mask* as a line that breaks at gaps.
+
+    Rows outside *mask* are set to NaN instead of being dropped: matplotlib
+    interrupts a line at NaN, so separated stretches (for example adapter
+    signal before and after the basecalled part) are not joined by a
+    straight segment across the read. Nothing is drawn for an empty mask.
+    """
+    if mask.any():
+        ax.plot(frame['time'], frame['signal'].where(mask), **line_kwargs)
+
+
+def main():
+    """Render the CSV given by --csv into the figure file given by --output.
+
+    The 'time' and 'signal' columns are required. 'ann' and 'polyA' are
+    optional: a missing 'ann' is treated as all -2 (unannotated) and a
+    missing 'polyA' as all -1 (no tail), so such a CSV still gets a plot.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--csv', required=True, help="Input annotated CSV")
+    parser.add_argument(
+        '--output',
+        required=True,
+        help="Output figure filename (extension controls format, e.g. .png, .pdf)"
+    )
+    parser.add_argument('--title', default="Nanopore Read Signal")
+    parser.add_argument(
+        '--figwidth',
+        type=float,
+        default=15.0,
+        help="Figure width in inches (default: 15.0)"
+    )
+    parser.add_argument(
+        '--figheight',
+        type=float,
+        default=3.2,
+        help="Figure height in inches (default: 3.2)"
+    )
+    args = parser.parse_args()
+
+    # Load the data generated by pod5_to_df.py
+    raw_signal_df = pd.read_csv(args.csv)
+    x_feature, y_feature = 'time', 'signal'
+
+    missing = [col for col in (x_feature, y_feature) if col not in raw_signal_df.columns]
+    if missing:
+        raise SystemExit(f"{args.csv}: missing required column(s): {', '.join(missing)}")
+
+    # The annotation columns may be absent (e.g. a read without a move
+    # table); fall back to neutral codes instead of failing with a KeyError.
+    if 'ann' in raw_signal_df.columns:
+        ann = raw_signal_df['ann']
+    else:
+        ann = pd.Series(-2, index=raw_signal_df.index)
+    if 'polyA' in raw_signal_df.columns:
+        polya = raw_signal_df['polyA']
+    else:
+        polya = pd.Series(-1, index=raw_signal_df.index)
+
+    # Plotting configuration
+    sns.set(font_scale=1)
+    sns.set_style("white")
+    fig, ax = plt.subplots(1, 1, figsize=(args.figwidth, args.figheight))
+
+    # 1. Unannotated part (ann == -2)
+    df_neg2 = raw_signal_df[ann == -2]
+    if not df_neg2.empty:
+        sns.scatterplot(data=df_neg2, x=x_feature, y=y_feature, color='blue',
+                        label='unannotated part', s=50, zorder=4, ax=ax)
+
+    # 2. Trimmed primer/adapter (ann == -1)
+    plot_masked_line(ax, raw_signal_df, ann == -1, color='green',
+                     label='trimmed primer and adapter', zorder=2)
+
+    # 3. Basecalled region (ann is 0 or 1)
+    plot_masked_line(ax, raw_signal_df, ann.isin([0, 1]), color='orange',
+                     label='basecalled region', zorder=3)
+
+    # 4. Poly-A tail region (polyA > -1), drawn in magenta above the other lines
+    plot_masked_line(ax, raw_signal_df, polya > -1, color='magenta',
+                     label='polyA-tail region', zorder=5, linewidth=2)
+
+    # 5. Samples that emit bases (ann == 1) - Red Circles
+    df_emit = raw_signal_df[ann == 1]
+    if not df_emit.empty:
+        sns.scatterplot(data=df_emit, x=x_feature, y=y_feature, color='red',
+                        label='samples that emit bases', s=50, fc="none", ec='red', zorder=6, ax=ax)
+
+    # ax.plot does not label the axes the way seaborn does, so set them here
+    ax.set(title=args.title, xlabel=x_feature, ylabel=y_feature)
+
+    # Legend placement
+    ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)
+    plt.tight_layout()
+    plt.savefig(args.output, dpi=300)
+    plt.close(fig)
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/pod5_to_df.py b/bin/pod5_to_df.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+"""Write one annotated raw-signal CSV per read from a POD5 file and its Dorado BAM."""
+import pod5 as p5
+import pandas as pd
+import numpy as np
+import pysam
+import argparse
+
+
+def load_bam_tags(bam_path):
+    """Return sequence and Dorado tags of each primary record, keyed by read name."""
+    bam_data = {}
+    with pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam:
+        for read in bam.fetch(until_eof=True):
+            # Secondary and supplementary records reuse the read name and can
+            # have a clipped or missing SEQ; skip them so they cannot replace
+            # the primary record.
+            if read.is_secondary or read.is_supplementary:
+                continue
+            bam_data[read.query_name] = {
+                # Read-orientation sequence: reverse-strand alignments are
+                # stored reverse-complemented, which would pair the moves
+                # with the wrong bases.
+                # NOTE(review): for direct RNA, confirm whether the move table
+                # runs in sequence order or in signal order.
+                'seq': read.get_forward_sequence(),
+                'mv': read.get_tag("mv") if read.has_tag("mv") else None,
+                'ts': read.get_tag("ts") if read.has_tag("ts") else 0,
+                'ns': read.get_tag("ns") if read.has_tag("ns") else 0,
+                'pt': read.get_tag("pt") if read.has_tag("pt") else 0,  # Poly-A length
+                'pa': read.get_tag("pa") if read.has_tag("pa") else None # Signal anchors
+            }
+    return bam_data
+
+
+def mark_polya(n_samples, pa_tag, pt):
+    """Return the per-sample 'polyA' column: *pt* inside the tail, -1 elsewhere.
+
+    pa array indices: 1 = start of polyA, 2 = end of polyA (treated as
+    inclusive here). The column stays all -1 when the tag is absent, too
+    short, or holds a negative or reversed range.
+    """
+    polya = np.full(n_samples, -1, dtype=np.int64)
+    if pa_tag is None or len(pa_tag) < 3:
+        return polya
+    pa_start, pa_end = pa_tag[1], pa_tag[2]
+    if 0 <= pa_start <= pa_end:
+        # Positional slice; an end past the signal is clipped by numpy
+        polya[pa_start:pa_end + 1] = pt
+    return polya
+
+
+def build_annotation(n_samples, move_tag, ts, ns):
+    """Return the per-sample 'ann' codes derived from the mv, ts and ns tags.
+
+    Layout: *ts* samples of -1, then every move repeated stride times
+    (stride is move_tag[0]), then n_samples - ns samples of -1. A result
+    shorter than the signal is left-padded with -2; a longer one is cut at
+    the end.
+    """
+    stride = move_tag[0]
+    ann = np.concatenate([
+        np.full(max(ts, 0), -1, dtype=np.int64),
+        np.repeat(np.asarray(move_tag[1:], dtype=np.int64), stride),
+        np.full(max(n_samples - ns, 0), -1, dtype=np.int64),
+    ])
+    if len(ann) < n_samples:
+        padding = np.full(n_samples - len(ann), -2, dtype=np.int64)
+        ann = np.concatenate([padding, ann])
+    return ann[:n_samples]
+
+
+def label_bases(ann, sequence):
+    """Return one base letter per sample, 'N' where no base applies.
+
+    A sample with ann == 1 takes the next base of *sequence*; following
+    samples with ann == 0 repeat that base. Everything else keeps 'N':
+    other codes, samples before the first base, and ann == 1 samples once
+    *sequence* is used up.
+    """
+    base_labels = ['N'] * len(ann)
+    seq_idx = 0
+    seq_len = len(sequence)
+    for i, val in enumerate(ann):
+        if val == 1 and seq_idx < seq_len:
+            base_labels[i] = sequence[seq_idx]
+            seq_idx += 1
+        elif val == 0 and seq_idx > 0:
+            base_labels[i] = sequence[seq_idx - 1]
+    return base_labels
+
+
+def main():
+    """Write <sample_id>_<read_id>_mapped.csv for each POD5 read found in the BAM.
+
+    Columns are time, signal, polyA, ann and base. A read without a move table
+    gets ann = -2 and base = 'N' throughout, so every CSV has the same columns.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--pod5', required=True)
+    parser.add_argument('--bam', required=True)
+    parser.add_argument('--sample_id', required=True)
+    args = parser.parse_args()
+
+    # 1. Load BAM data including Poly-A estimation tags
+    bam_data = load_bam_tags(args.bam)
+
+    with p5.Reader(args.pod5) as reader:
+        for read_record in reader.reads():
+            read_id = str(read_record.read_id)
+            info = bam_data.get(read_id)
+            if info is None:
+                continue
+
+            # Signal extraction
+            signal = read_record.signal
+            sample_rate = read_record.run_info.sample_rate
+            n_samples = len(signal)
+            raw_signal_df = pd.DataFrame({
+                'time': np.arange(n_samples) / sample_rate,
+                'signal': signal,
+            })
+
+            # 2. Map Poly-A region (stays -1 everywhere without a usable pa tag)
+            raw_signal_df['polyA'] = mark_polya(n_samples, info['pa'], info['pt'])
+
+            # 3. Handle moves; a read without a move table is all unannotated
+            if info['mv'] is not None:
+                ann = build_annotation(n_samples, info['mv'], info['ts'], info['ns'])
+            else:
+                ann = np.full(n_samples, -2, dtype=np.int64)
+            raw_signal_df['ann'] = ann
+
+            # 4. Map nucleotides to signal (a record without SEQ yields only 'N')
+            raw_signal_df['base'] = label_bases(ann.tolist(), info['seq'] or '')
+
+            # Save annotated CSV
+            output_name = f"{args.sample_id}_{read_id}_mapped.csv"
+            raw_signal_df.to_csv(output_name, index=False)
+
+if __name__ == "__main__":
+    main()
diff --git a/main.nf b/main.nf
@@ -5,19 +5,40 @@ nextflow.enable.dsl = 2
 if( !params.tsv ) { exit 1, "Please provide the input TSV file with --tsv" }
 
 workflow {
-
-    // Using lowercase 'channel' avoids the ConfigObject error
     samples_ch = channel
         .fromPath(params.tsv)
         .splitCsv(header: true, sep: '\t')
         .map { row -> 
+            // Fail early with a clear message instead of handing null to file()
             if (!row.sample_id || !row.pod5) {
                 error "TSV missing required columns 'sample_id' or 'pod5'"
             }
             tuple(row.sample_id, file(row.pod5)) 
         }
 
+    // 1. Initial basecall
     dorado_basecall(samples_ch)
+
+    // 2. Extract random IDs
+    extract_read_ids(dorado_basecall.out.bam)
+
+    // 3. Create subset POD5
+    filter_input_ch = samples_ch.join(extract_read_ids.out.ids_file)
+
+    filter_pod5(filter_input_ch)
+
+    // 4. Re-run Dorado for moves (Needs the POD5 subset)
+    dorado_emit_moves(filter_pod5.out)
+
+    // 5. Join POD5 and BAM before generating CSV
+    // filter_pod5.out is [sample_id, pod5]
+    // dorado_emit_moves.out.bam is [sample_id, bam]
+    final_input_ch = filter_pod5.out.join(dorado_emit_moves.out.bam)
+
+    generate_signal_df(final_input_ch)
+
+    // 6. One plot per CSV: flatten() emits each file as its own item
+    visualize_signal(generate_signal_df.out.flatten())
 }
 
 process dorado_basecall {
@@ -51,4 +72,109 @@ process dorado_basecall {
     # Index the resulting BAM
     samtools index -@ ${task.cpus} \$OUT_BAM
     """
-}
+}
+
+// Pick up to params.num_reads distinct read IDs at random from the basecalled BAM
+process extract_read_ids {
+    tag "${sample_id}"
+    input:
+    tuple val(sample_id), path(bam)
+
+    output:
+    tuple val(sample_id), path("${sample_id}_read_ids.txt"), emit: ids_file
+
+    script:
+    """
+    # pipefail: a samtools failure must fail the task, not yield an empty ID list
+    set -o pipefail
+    samtools view ${bam} | cut -f1 | sort -u | shuf -n ${params.num_reads} > ${sample_id}_read_ids.txt
+    """
+}
+
+// Write a POD5 that holds only the sampled reads
+process filter_pod5 {
+    tag "${sample_id}"
+    publishDir "${params.outdir}/subsampled_pod5", mode: 'copy'
+
+    input:
+    // This matches the output of the .join()
+    tuple val(sample_id), path(original_pod5), path(read_ids_txt)
+
+    output:
+    tuple val(sample_id), path("${sample_id}.subset.pod5")
+
+    script:
+    """
+    pod5 filter ${original_pod5} --output ${sample_id}.subset.pod5 --ids ${read_ids_txt} --force-overwrite
+    """
+}
+
+// Basecall the subset again with move tables and poly(A) estimation enabled
+process dorado_emit_moves {
+    tag "${sample_id}"
+    publishDir "${params.outdir}/moves_bam", mode: 'copy'
+
+    input:
+    tuple val(sample_id), path(subset_pod5)
+
+    output:
+    tuple val(sample_id), path("${sample_id}.moves.bam"), emit: bam
+
+    script:
+    """
+    # pipefail: a Dorado crash must fail the task instead of leaving a truncated BAM
+    set -o pipefail
+    # Pipe Dorado output to samtools to ensure a valid, compressed BAM with header
+    ${params.dorado} basecaller \\
+        ${params.model} \\
+        ${subset_pod5} \\
+        --emit-moves \\
+        --estimate-poly-a \\
+        --poly-a-config ${params.polyA} \\
+        --mm2-opts "-x splice -Y" \\
+        --reference ${params.ref} \\
+        --device "cuda:all" | samtools view -bS - > ${sample_id}.moves.bam
+    """
+}
+
+// Build one annotated CSV per read from the subset POD5 and its moves BAM
+process generate_signal_df {
+    tag "${sample_id}"
+    publishDir "${params.outdir}/annotated_data", mode: 'copy'
+
+    input:
+    tuple val(sample_id), path(subset_pod5), path(moves_bam)
+
+    output:
+    path "*.csv"
+
+    script:
+    """
+    pod5_to_df.py --pod5 ${subset_pod5} --bam ${moves_bam} --sample_id ${sample_id}
+    """
+}
+
+// Render one PDF per annotated CSV
+process visualize_signal {
+    tag "${csv.baseName}"
+    publishDir "${params.outdir}/plots", mode: 'copy'
+
+    input:
+    path csv
+
+    output:
+    path "*.pdf"
+
+    script:
+    // <sample_id>_<read_id>_mapped.csv -> <sample_id>_<read_id>
+    def read_id = csv.baseName.replace("_mapped", "")
+    """
+    plot_signal.py \\
+        --csv ${csv} \\
+        --output ${read_id}.pdf \\
+        --title "Read: ${read_id}" \\
+        --figwidth ${params.figwidth} \\
+        --figheight ${params.figheight}
+    """
+}
+
diff --git a/nextflow.config b/nextflow.config
@@ -13,27 +13,42 @@ params {
     model      = "${myHome}/nanoflowz/rna004_130bps_sup@v5.1.0"
     polyA      = "${myHome}/materials/polya_config.toml"
     ref        = "${myHome}/nanoflowz/GRCm38.primary_assembly.genome.fa"
+
+    num_reads  = 10   // Read IDs drawn per sample (shuf -n in extract_read_ids)
+
+    // Plot configuration (passed to plot_signal.py by visualize_signal)
+    figwidth  = 15.0  // Figure width in inches
+    figheight = 3.2   // Figure height in inches
 }
 
 process {
     executor = 'slurm'
-    queue    = 'a100'
-    
-    // Resource requests
-    cpus   = 6
-    memory = '40 GB'
-    time   = '29m'
-    
-    // GPU and Cluster specific options
-    clusterOptions = '--gres=gpu:1 --qos=a100-1day'
-
-    // Conda integration
-    // Using the path to the environment directly is most efficient on Slurm
-    conda = params.condaEnv
-
-    // This ensures the compute node shell can find the 'conda' command 
-    // to perform the internal environment activation
-    beforeScript = "pwd; source ${myHome}/miniconda3/etc/profile.d/conda.sh"
+    conda    = params.condaEnv
+    beforeScript = """
+        source ${myHome}/miniconda3/etc/profile.d/conda.sh
+        export LD_LIBRARY_PATH=${myHome}/miniconda3/envs/nanopore/lib:\$LD_LIBRARY_PATH
+    """
+
+    // Defaults for processes without a withName block below (currently
+    // extract_read_ids, filter_pod5, generate_signal_df); no queue is set for them
+    cpus   = 1
+    memory = '4 GB'
+    time   = '20m'
+
+    // GPU overrides for both Dorado steps (initial basecall and the --emit-moves re-run)
+    withName: 'dorado_basecall|dorado_emit_moves' {
+        queue          = 'a100'
+        cpus           = 6
+        memory         = '40 GB'
+        time           = '29m'
+        clusterOptions = '--gres=gpu:1 --qos=a100-1day'
+    }
+
+    withName: visualize_signal {
+        cpus   = 1
+        memory = '8 GB' // Seaborn can be hungry with 100k+ points
+        time   = '15m'
+    }
 }
 
 conda {