Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions bin/aggregate_read_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env python3
import argparse
import pandas as pd

def main():
parser = argparse.ArgumentParser(description="Aggregate read counts from Nextflow chunks")
parser.add_argument('--input', nargs='+', required=True, help="List of chunk TSV files")
parser.add_argument('--output', required=True, help="Output aggregated TSV")
args = parser.parse_args()

df_list = []
for f in args.input:
df = pd.read_csv(f, sep='\t', names=["sample_id", "step", "total_reads", "unmapped_reads", "um_reads", "mm_reads"])
df_list.append(df)

combined = pd.concat(df_list)

# Group by sample and step, then sum the chunks
agg = combined.groupby(["sample_id", "step"]).sum().reset_index()

# Sort alphabetically (our step names will start with 01_, 02_, etc.)
agg = agg.sort_values(["sample_id", "step"])

agg.to_csv(args.output, sep='\t', index=False)

if __name__ == "__main__":
main()
80 changes: 80 additions & 0 deletions bin/compute_polya_bedgraphs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/env python3
import pysam
import argparse
import sys
import numpy as np
from collections import defaultdict

def weighted_mean(pt_list, nh_list):
weights = 1.0 / np.array(nh_list)
return np.average(pt_list, weights=weights)

def weighted_median(pt_list, nh_list):
# Sort points and weights
sorted_indices = np.argsort(pt_list)
pts = np.array(pt_list)[sorted_indices]
weights = (1.0 / np.array(nh_list))[sorted_indices]

cumsum = np.cumsum(weights)
cutoff = np.sum(weights) / 2.0
return pts[cumsum >= cutoff][0]

def main():
parser = argparse.ArgumentParser(description="Calculate weighted PolyA length bedgraphs.")
parser.add_argument("--bam_in", required=True)
parser.add_argument("--tag_cs", required=True)
parser.add_argument("--metric", choices=['mean', 'median'], required=True)
parser.add_argument("--include_multimappers", type=str, default="true")
args = parser.parse_args()

include_mm = args.include_multimappers.lower() == 'true'

# Dictionaries to accumulate (pt, NH) lists by genomic coordinate
# Keys: (chrom, pos)
plus_data = defaultdict(lambda: ({'pt': [], 'nh': []}))
minus_data = defaultdict(lambda: ({'pt': [], 'nh': []}))

with pysam.AlignmentFile(args.bam_in, "rb") as bam:
for r in bam:
if r.is_unmapped:
continue

# Filter multi-mappers if requested
if not include_mm and r.mapping_quality != 255:
continue

try:
cs_1based = r.get_tag(args.tag_cs)
pt_len = r.get_tag("pt")
nh = r.get_tag("NH") if r.has_tag("NH") else 1

if pt_len > 0:
cs_0based = cs_1based - 1
chrom = r.reference_name

if r.is_reverse:
minus_data[(chrom, cs_0based)]['pt'].append(pt_len)
minus_data[(chrom, cs_0based)]['nh'].append(nh)
else:
plus_data[(chrom, cs_0based)]['pt'].append(pt_len)
plus_data[(chrom, cs_0based)]['nh'].append(nh)
except KeyError:
pass

# Function to write data to bedgraph
def write_bg(data_dict, filename):
with open(filename, "w") as f:
for (chrom, pos), vals in data_dict.items():
if args.metric == 'mean':
val = weighted_mean(vals['pt'], vals['nh'])
else:
val = weighted_median(vals['pt'], vals['nh'])

# BedGraph format: chrom start end value
f.write(f"{chrom}\t{pos}\t{pos+1}\t{val:.2f}\n")

write_bg(plus_data, "plus.bg")
write_bg(minus_data, "minus.bg")

if __name__ == "__main__":
main()
21 changes: 21 additions & 0 deletions bin/count_reads.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash

SAMPLE=$1
STEP=$2
BAM=$3
CPUS=$4
OUT=$5

# Extract read name and whether it mapped to a chromosome (RNAME != *)
samtools view -@ $CPUS -F 2048 $BAM | awk '{if($3=="*") print $1"\tUN"; else print $1"\tMA"}' | sort -S 2G | uniq -c > counts.tmp

# Sum up the occurrences
UNMAPPED=$(awk '$3=="UN" {sum++} END {print sum+0}' counts.tmp)
UM=$(awk '$1==1 && $3=="MA" {sum++} END {print sum+0}' counts.tmp)
MM=$(awk '$1>1 && $3=="MA" {sum++} END {print sum+0}' counts.tmp)
TOTAL=$((UNMAPPED + UM + MM))

# Output as TSV row
echo -e "${SAMPLE}\t${STEP}\t${TOTAL}\t${UNMAPPED}\t${UM}\t${MM}" > $OUT

rm counts.tmp
38 changes: 38 additions & 0 deletions bin/extract_bam_tags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env python3
import pysam
import argparse
import gzip

def main():
parser = argparse.ArgumentParser(description="Extract BAM tags to TSV.")
parser.add_argument("--bam_in", required=True)
parser.add_argument("--tsv_out", required=True)
parser.add_argument("--tags", nargs='+', required=True)
parser.add_argument("--include_multimappers", type=str, default="true")
args = parser.parse_args()

include_mm = args.include_multimappers.lower() == 'true'

with pysam.AlignmentFile(args.bam_in, "rb") as bam, gzip.open(args.tsv_out, "wt") as f:
f.write("read_id\tchromosome\t" + "\t".join(args.tags) + "\n")

for r in bam:
if r.is_unmapped:
continue

if not include_mm and r.mapping_quality != 255:
continue

chrom = r.reference_name if r.reference_name else "NA"
row = [r.query_name, chrom]

for tag in args.tags:
try:
val = str(r.get_tag(tag))
except KeyError:
val = "NA"
row.append(val)
f.write("\t".join(row) + "\n")

if __name__ == "__main__":
main()
56 changes: 30 additions & 26 deletions bin/extract_cs_bigwig.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,41 @@
#!/usr/bin/env python3

import pysam
import argparse
import sys

def main():
parser = argparse.ArgumentParser(description="Extract 1-based cleavage sites from a BAM custom tag and convert to 0-based BED format.")
parser.add_argument("--bam_in", required=True, help="Input BAM file with uniquely mapped reads")
parser.add_argument("--bed_out", required=True, help="Output BED file")
parser.add_argument("--tag", required=True, help="The SAM tag storing the 1-based cleavage site (e.g., XF or XO)")
parser.add_argument("--MAPQ_min", type=int, required=False, default=255, help="minimal MAPQ for an alignment to be included")
parser = argparse.ArgumentParser(description="Extract 1-based cleavage sites to BED format with 1/NH weights.")
parser.add_argument("--bam_in", required=True)
parser.add_argument("--bed_out", required=True)
parser.add_argument("--tag", required=True)
parser.add_argument("--include_multimappers", type=str, default="true")
args = parser.parse_args()

include_mm = args.include_multimappers.lower() == 'true'

with pysam.AlignmentFile(args.bam_in, "rb") as bam, open(args.bed_out, "w") as f:
for r in bam:
# MAPQ 255 represents unique alignments in your pipeline
if r.mapping_quality >= args.MAPQ_min and not r.is_unmapped:
try:
cs_1based = r.get_tag(args.tag)
# Convert 1-based tag coordinate to 0-based BED interval (length = 1)
start = cs_1based - 1
end = cs_1based
strand = "-" if r.is_reverse else "+"

# Write in BED6 format: chrom, start, end, name, score, strand
f.write(f"{r.reference_name}\t{start}\t{end}\t{r.query_name}\t0\t{strand}\n")
except KeyError:
# Skip reads that are missing the designated cleavage site tag
pass
if r.is_unmapped:
continue

# Filter multi-mappers if requested
if not include_mm and r.mapping_quality != 255:
continue

try:
cs_1based = r.get_tag(args.tag)
# Fetch NH tag, defaulting to 1 if it somehow went missing
nh = r.get_tag("NH") if r.has_tag("NH") else 1
weight = 1.0 / nh

# Convert 1-based tag coordinate to 0-based BED interval (length = 1)
start = cs_1based - 1
end = cs_1based
strand = "-" if r.is_reverse else "+"

# Write in BED6 format: chrom, start, end, name, score (weight), strand
f.write(f"{r.reference_name}\t{start}\t{end}\t{r.query_name}\t{weight:.4f}\t{strand}\n")
except KeyError:
pass

if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
sys.stderr.write("User interrupt!\n")
sys.exit(1)
main()
122 changes: 122 additions & 0 deletions bin/plot_polya_distributions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/bin/env python3
import argparse
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import re

def natural_sort_key(s):
parsed = re.split('([0-9]+)', str(s))
return [(0, int(text)) if text.isdigit() else (1, text.lower()) for text in parsed]

def weighted_mean(group):
weights = 1.0 / group['NH']
return np.average(group['pt'], weights=weights)

def weighted_median(group):
df_sorted = group.sort_values('pt')
weights = 1.0 / df_sorted['NH']
cumsum = weights.cumsum()
cutoff = weights.sum() / 2.0
# Return the first pt value where cumulative weight >= 50%
return df_sorted[cumsum >= cutoff]['pt'].iloc[0]

def main():
parser = argparse.ArgumentParser()
parser.add_argument('--input', nargs='+', required=True)
parser.add_argument('--output_prefix', required=True)
args = parser.parse_args()

df_list = []
for f in args.input:
sample_id = os.path.basename(f).split('.tags.tsv')[0]
df = pd.read_csv(f, sep='\t', compression='gzip')
df['sample_id'] = sample_id
df_list.append(df)

combined = pd.concat(df_list, ignore_index=True)

# Clean data
combined['pt'] = pd.to_numeric(combined['pt'], errors='coerce')
combined['NH'] = pd.to_numeric(combined['NH'], errors='coerce').fillna(1)
valid_pt = combined.dropna(subset=['pt'])
valid_pt = valid_pt[valid_pt['pt'] > 0]

# Process Chromosomes
top_chrs = valid_pt['chromosome'].value_counts().nlargest(30).index
unique_top_chrs = [c for c in valid_pt['chromosome'].unique() if c in top_chrs]
sorted_chrs = sorted(unique_top_chrs, key=natural_sort_key)

valid_pt = valid_pt[valid_pt['chromosome'].isin(sorted_chrs)]
valid_pt['chromosome'] = pd.Categorical(valid_pt['chromosome'], categories=sorted_chrs, ordered=True)

sns.set_theme(style="whitegrid")

# ==========================================
# PLOT 1: All Reads (Deduplicated to avoid MM overcounting)
# ==========================================
unique_reads = valid_pt.drop_duplicates(subset=['sample_id', 'read_id'])

plt.figure(figsize=(10, 6))
sns.boxplot(data=unique_reads, x='sample_id', y='pt', showfliers=False)
plt.title('PolyA Tail Length (All Reads)')
plt.xticks(rotation=45, ha='right'); plt.tight_layout()
plt.savefig(f"{args.output_prefix}_all_reads_pooled.pdf")
plt.close()

plt.figure(figsize=(16, 6))
sns.boxplot(data=unique_reads, x='sample_id', y='pt', hue='chromosome', showfliers=False)
plt.title('PolyA Tail Length (All Reads, per Chromosome)')
plt.xticks(rotation=45, ha='right'); plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left')
plt.tight_layout()
plt.savefig(f"{args.output_prefix}_all_reads_per_chr.pdf")
plt.close()

# ==========================================
# Filter for Gene-Assigned Reads
# ==========================================
gene_assigned = valid_pt[~valid_pt['XT'].isna() & (valid_pt['XT'] != 'NA')]

# ==========================================
# PLOT 2 & 3: Per-Gene Weighted Math
# ==========================================
# Calculate Weighted Median
gene_median = gene_assigned.groupby(['sample_id', 'chromosome', 'XT'], observed=True).apply(weighted_median).reset_index(name='pt')

plt.figure(figsize=(10, 6))
sns.boxplot(data=gene_median, x='sample_id', y='pt', showfliers=False)
plt.title('PolyA Tail Length (Per-Gene Median)')
plt.xticks(rotation=45, ha='right'); plt.tight_layout()
plt.savefig(f"{args.output_prefix}_per_gene_median_pooled.pdf")
plt.close()

plt.figure(figsize=(16, 6))
sns.boxplot(data=gene_median, x='sample_id', y='pt', hue='chromosome', showfliers=False)
plt.title('PolyA Tail Length (Per-Gene Median per Chromosome)')
plt.xticks(rotation=45, ha='right'); plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left')
plt.tight_layout()
plt.savefig(f"{args.output_prefix}_per_gene_median_per_chr.pdf")
plt.close()

# Calculate Weighted Mean
gene_mean = gene_assigned.groupby(['sample_id', 'chromosome', 'XT'], observed=True).apply(weighted_mean).reset_index(name='pt')

plt.figure(figsize=(10, 6))
sns.boxplot(data=gene_mean, x='sample_id', y='pt', showfliers=False)
plt.title('PolyA Tail Length (Per-Gene Mean)')
plt.xticks(rotation=45, ha='right'); plt.tight_layout()
plt.savefig(f"{args.output_prefix}_per_gene_mean_pooled.pdf")
plt.close()

plt.figure(figsize=(16, 6))
sns.boxplot(data=gene_mean, x='sample_id', y='pt', hue='chromosome', showfliers=False)
plt.title('PolyA Tail Length (Per-Gene Mean per Chromosome)')
plt.xticks(rotation=45, ha='right'); plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left')
plt.tight_layout()
plt.savefig(f"{args.output_prefix}_per_gene_mean_per_chr.pdf")
plt.close()

if __name__ == "__main__":
main()
Loading
Loading