Skip to content

Commit 8380ac6

Browse files
author
Krish Agarwal
committed
add polyA-tail annotation to Dorado war signal QC plots
- Parse pa:B:i tag from Dorado BAM output. - Map polyA-tail positions onto war signal timeline. - Ensure visualization distinguishes ployA region from basecalled region.
1 parent 24ce5c5 commit 8380ac6

4 files changed

Lines changed: 329 additions & 26 deletions

File tree

bin/plot_signal.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#!/usr/bin/env python3
2+
import matplotlib
3+
matplotlib.use('Agg') # Force non-interactive backend for HPC
4+
import pandas as pd
5+
import seaborn as sns
6+
import matplotlib.pyplot as plt
7+
import argparse
8+
import os
9+
10+
def main():
11+
parser = argparse.ArgumentParser()
12+
parser.add_argument('--csv', required=True, help="Input annotated CSV")
13+
parser.add_argument(
14+
'--output',
15+
required=True,
16+
help="Output figure filename (extension controls format, e.g. .png, .pdf)"
17+
)
18+
parser.add_argument('--title', default="Nanopore Read Signal")
19+
parser.add_argument(
20+
'--figwidth',
21+
type=float,
22+
default=15.0,
23+
help="Figure width in inches (default: 15.0)"
24+
)
25+
parser.add_argument(
26+
'--figheight',
27+
type=float,
28+
default=3.2,
29+
help="Figure height in inches (default: 3.2)"
30+
)
31+
args = parser.parse_args()
32+
33+
# Load the data generated by pod5_to_df.py
34+
raw_signal_df = pd.read_csv(args.csv)
35+
36+
# Plotting configuration
37+
sns.set(font_scale=1)
38+
sns.set_style("white")
39+
fig, ax = plt.subplots(1, 1, figsize=(args.figwidth, args.figheight))
40+
x_feature, y_feature = 'time', 'signal'
41+
42+
# 1. Unannotated part (ann == -2)
43+
df_neg2 = raw_signal_df[raw_signal_df['ann'] == -2]
44+
if not df_neg2.empty:
45+
sns.scatterplot(data=df_neg2, x=x_feature, y=y_feature, color='blue',
46+
label='unannotated part', s=50, zorder=4, ax=ax)
47+
48+
# 2. Trimmed primer/adapter (ann == -1)
49+
df_neg1 = raw_signal_df[raw_signal_df['ann'] == -1]
50+
if not df_neg1.empty:
51+
sns.lineplot(data=df_neg1, x=x_feature, y=y_feature, color='green',
52+
label='trimmed primer and adapter', zorder=2, ax=ax)
53+
54+
# 3. Basecalled region (ann is 0 or 1)
55+
df_base = raw_signal_df[raw_signal_df['ann'].isin([0, 1])]
56+
if not df_base.empty:
57+
sns.lineplot(data=df_base, x=x_feature, y=y_feature, color='orange',
58+
label='basecalled region', zorder=3, ax=ax)
59+
60+
# 4. Poly-A Tail Region (polyA > -1)
61+
# We highlight the region identified by the 'pa' tag anchors
62+
df_polya = raw_signal_df[raw_signal_df['polyA'] > -1]
63+
if not df_polya.empty:
64+
# We use a distinct color (magenta) to show the estimated tail
65+
sns.lineplot(data=df_polya, x=x_feature, y=y_feature, color='magenta',
66+
label='polyA-tail region', zorder=5, linewidth=2, ax=ax)
67+
68+
# 5. Samples that emit bases (ann == 1) - Red Circles
69+
df_emit = raw_signal_df[raw_signal_df['ann'] == 1]
70+
if not df_emit.empty:
71+
sns.scatterplot(data=df_emit, x=x_feature, y=y_feature, color='red',
72+
label='samples that emit bases', s=50, fc="none", ec='red', zorder=6, ax=ax)
73+
74+
ax.set(title=args.title)
75+
76+
# Legend placement
77+
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)
78+
plt.tight_layout()
79+
plt.savefig(args.output, dpi=300)
80+
plt.close()
81+
82+
if __name__ == "__main__":
83+
main()

bin/pod5_to_df.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#!/usr/bin/env python3
2+
import pod5 as p5
3+
import pandas as pd
4+
import numpy as np
5+
import pysam
6+
import argparse
7+
8+
def main():
9+
parser = argparse.ArgumentParser()
10+
parser.add_argument('--pod5', required=True)
11+
parser.add_argument('--bam', required=True)
12+
parser.add_argument('--sample_id', required=True)
13+
args = parser.parse_args()
14+
15+
# 1. Load BAM data including Poly-A estimation tags
16+
bam_data = {}
17+
with pysam.AlignmentFile(args.bam, "rb", check_sq=False) as bam:
18+
for read in bam.fetch(until_eof=True):
19+
bam_data[read.query_name] = {
20+
'seq': read.query_sequence,
21+
'mv': read.get_tag("mv") if read.has_tag("mv") else None,
22+
'ts': read.get_tag("ts") if read.has_tag("ts") else 0,
23+
'ns': read.get_tag("ns") if read.has_tag("ns") else 0,
24+
'pt': read.get_tag("pt") if read.has_tag("pt") else 0, # Poly-A length
25+
'pa': read.get_tag("pa") if read.has_tag("pa") else None # Signal anchors
26+
}
27+
28+
with p5.Reader(args.pod5) as reader:
29+
for read_record in reader.reads():
30+
read_id = str(read_record.read_id)
31+
if read_id not in bam_data:
32+
continue
33+
34+
# Signal extraction
35+
signal = read_record.signal
36+
sample_rate = read_record.run_info.sample_rate
37+
time = np.arange(len(signal)) / sample_rate
38+
raw_signal_df = pd.DataFrame({'time': time, 'signal': signal})
39+
40+
info = bam_data[read_id]
41+
42+
# Initialize polyA column (default to -1 for non-tail region)
43+
raw_signal_df['polyA'] = -1
44+
45+
# 2. Map Poly-A Region if tags exist
46+
if info['pa'] is not None:
47+
# pa array indices: 1 = start of polyA, 2 = end of polyA
48+
pa_start = info['pa'][1]
49+
pa_end = info['pa'][2]
50+
51+
# Fill the polyA column with the estimated length (pt) for that region
52+
# This makes it easy to filter/color the tail in plots
53+
raw_signal_df.loc[pa_start:pa_end, 'polyA'] = info['pt']
54+
55+
# 3. Handle Moves and Base Mapping
56+
if info['mv'] is not None:
57+
stride = info['mv'][0]
58+
moves = info['mv'][1:]
59+
sequence = info['seq']
60+
61+
# Build 'ann' (moves) array
62+
a = info['ts'] * [-1]
63+
for elem in moves:
64+
a += [elem] * stride
65+
a += (len(raw_signal_df) - info['ns']) * [-1]
66+
a = [-2] * max((len(raw_signal_df) - len(a)), 0) + a
67+
68+
# Trim or pad 'a' to match signal length exactly
69+
if len(a) > len(raw_signal_df):
70+
a = a[:len(raw_signal_df)]
71+
else:
72+
a += [-1] * (len(raw_signal_df) - len(a))
73+
74+
raw_signal_df['ann'] = a
75+
76+
# 4. Map Nucleotides to Signal
77+
base_labels = ['N'] * len(raw_signal_df)
78+
seq_idx = 0
79+
seq_len = len(sequence)
80+
81+
for i, val in enumerate(a):
82+
if val == 1 and seq_idx < seq_len:
83+
current_base = sequence[seq_idx]
84+
base_labels[i] = current_base
85+
seq_idx += 1
86+
elif val == 0 and seq_idx > 0:
87+
base_labels[i] = sequence[seq_idx - 1]
88+
89+
raw_signal_df['base'] = base_labels
90+
91+
# Save annotated CSV
92+
output_name = f"{args.sample_id}_{read_id}_mapped.csv"
93+
raw_signal_df.to_csv(output_name, index=False)
94+
95+
if __name__ == "__main__":
96+
main()

main.nf

Lines changed: 118 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,33 @@ nextflow.enable.dsl = 2
55
if( !params.tsv ) { exit 1, "Please provide the input TSV file with --tsv" }
66

77
workflow {
8-
9-
// Using lowercase 'channel' avoids the ConfigObject error
108
samples_ch = channel
119
.fromPath(params.tsv)
1210
.splitCsv(header: true, sep: '\t')
13-
.map { row ->
14-
if (!row.sample_id || !row.pod5) {
15-
error "TSV missing required columns 'sample_id' or 'pod5'"
16-
}
17-
tuple(row.sample_id, file(row.pod5))
18-
}
11+
.map { row -> tuple(row.sample_id, file(row.pod5)) }
1912

13+
// 1. Initial basecall
2014
dorado_basecall(samples_ch)
15+
16+
// 2. Extract random IDs
17+
extract_read_ids(dorado_basecall.out.bam)
18+
19+
// 3. Create subset POD5
20+
filter_input_ch = samples_ch.join(extract_read_ids.out.ids_file)
21+
22+
filter_pod5(filter_input_ch)
23+
24+
// 4. Re-run Dorado for moves (Needs the POD5 subset)
25+
dorado_emit_moves(filter_pod5.out)
26+
27+
// Join POD5 and BAM before generating CSV
28+
// filter_pod5.out is [sample_id, pod5]
29+
// dorado_emit_moves.out.bam is [sample_id, bam]
30+
final_input_ch = filter_pod5.out.join(dorado_emit_moves.out.bam)
31+
32+
generate_signal_df(final_input_ch)
33+
34+
visualize_signal(generate_signal_df.out.flatten())
2135
}
2236

2337
process dorado_basecall {
@@ -51,4 +65,99 @@ process dorado_basecall {
5165
# Index the resulting BAM
5266
samtools index -@ ${task.cpus} \$OUT_BAM
5367
"""
54-
}
68+
}
69+
70+
process extract_read_ids {
71+
tag "${sample_id}"
72+
input:
73+
tuple val(sample_id), path(bam)
74+
75+
output:
76+
tuple val(sample_id), path("${sample_id}_read_ids.txt"), emit: ids_file
77+
78+
script:
79+
"""
80+
samtools view ${bam} | cut -f1 | sort -u | shuf -n ${params.num_reads} > ${sample_id}_read_ids.txt
81+
"""
82+
}
83+
84+
process filter_pod5 {
85+
tag "${sample_id}"
86+
publishDir "${params.outdir}/subsampled_pod5", mode: 'copy'
87+
88+
input:
89+
// This matches the output of the .join()
90+
tuple val(sample_id), path(original_pod5), path(read_ids_txt)
91+
92+
output:
93+
tuple val(sample_id), path("${sample_id}.subset.pod5")
94+
95+
script:
96+
"""
97+
pod5 filter ${original_pod5} --output ${sample_id}.subset.pod5 --ids ${read_ids_txt} --force-overwrite
98+
"""
99+
}
100+
101+
process dorado_emit_moves {
102+
tag "${sample_id}"
103+
publishDir "${params.outdir}/moves_bam", mode: 'copy'
104+
105+
input:
106+
tuple val(sample_id), path(subset_pod5)
107+
108+
output:
109+
tuple val(sample_id), path("${sample_id}.moves.bam"), emit: bam
110+
111+
script:
112+
"""
113+
# Pipe Dorado output to samtools to ensure a valid, compressed BAM with header
114+
${params.dorado} basecaller \\
115+
${params.model} \\
116+
${subset_pod5} \\
117+
--emit-moves \\
118+
--estimate-poly-a \\
119+
--poly-a-config ${params.polyA} \\
120+
--mm2-opts "-x splice -Y" \\
121+
--reference ${params.ref} \\
122+
--device "cuda:all" | samtools view -bS - > ${sample_id}.moves.bam
123+
"""
124+
}
125+
126+
process generate_signal_df {
127+
tag "${sample_id}"
128+
publishDir "${params.outdir}/annotated_data", mode: 'copy'
129+
130+
input:
131+
tuple val(sample_id), path(subset_pod5), path(moves_bam)
132+
133+
output:
134+
path "*.csv"
135+
136+
script:
137+
"""
138+
pod5_to_df.py --pod5 ${subset_pod5} --bam ${moves_bam} --sample_id ${sample_id}
139+
"""
140+
}
141+
142+
process visualize_signal {
143+
tag "${csv.baseName}"
144+
publishDir "${params.outdir}/plots", mode: 'copy'
145+
146+
input:
147+
path csv
148+
149+
output:
150+
path "*.pdf"
151+
152+
script:
153+
def read_id = csv.baseName.replace("_mapped", "")
154+
"""
155+
plot_signal.py \\
156+
--csv ${csv} \\
157+
--output ${read_id}.pdf \\
158+
--title "Read: ${read_id}" \\
159+
--figwidth ${params.figwidth} \\
160+
--figheight ${params.figheight}
161+
"""
162+
}
163+

nextflow.config

Lines changed: 32 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,27 +13,42 @@ params {
1313
model = "${myHome}/nanoflowz/rna004_130bps_sup@v5.1.0"
1414
polyA = "${myHome}/materials/polya_config.toml"
1515
ref = "${myHome}/nanoflowz/GRCm38.primary_assembly.genome.fa"
16+
17+
num_reads = 10 // Default number of read IDs to sample
18+
19+
// Plot configuration
20+
figwidth = 15.0 // Figure width in inches
21+
figheight = 3.2 // Figure height in inches
1622
}
1723

1824
process {
1925
executor = 'slurm'
20-
queue = 'a100'
21-
22-
// Resource requests
23-
cpus = 6
24-
memory = '40 GB'
25-
time = '29m'
26-
27-
// GPU and Cluster specific options
28-
clusterOptions = '--gres=gpu:1 --qos=a100-1day'
29-
30-
// Conda integration
31-
// Using the path to the environment directly is most efficient on Slurm
32-
conda = params.condaEnv
33-
34-
// This ensures the compute node shell can find the 'conda' command
35-
// to perform the internal environment activation
36-
beforeScript = "pwd; source ${myHome}/miniconda3/etc/profile.d/conda.sh"
26+
conda = params.condaEnv
27+
beforeScript = """
28+
source ${myHome}/miniconda3/etc/profile.d/conda.sh
29+
export LD_LIBRARY_PATH=${myHome}/miniconda3/envs/nanopore/lib:\$LD_LIBRARY_PATH
30+
"""
31+
32+
// Default resources for CPU-only processes
33+
// (extract_read_ids, filter_pod5, generate_signal_df)
34+
cpus = 1
35+
memory = '4 GB'
36+
time = '20m'
37+
38+
// Specific overrides for ONLY the basecaller
39+
withName: 'dorado_basecall|dorado_emit_moves' {
40+
queue = 'a100'
41+
cpus = 6
42+
memory = '40 GB'
43+
time = '29m'
44+
clusterOptions = '--gres=gpu:1 --qos=a100-1day'
45+
}
46+
47+
withName: visualize_signal {
48+
cpus = 1
49+
memory = '8 GB' // Seaborn can be hungry with 100k+ points
50+
time = '15m'
51+
}
3752
}
3853

3954
conda {

0 commit comments

Comments
 (0)