Skip to content

Commit d33dc2b

Browse files
committed
subsample_bam
1 parent 19d96f0 commit d33dc2b

File tree

8 files changed

+327
-234
lines changed

8 files changed

+327
-234
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
55
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
66

7+
## [v0.3.14] - 2023-10-25
8+
### Changed
9+
- `subsample_bam` and `coverage_from_bam` now have unified read filtering options and logic.
10+
### Added
11+
- `filter_bam` to filter a bam with the same logic used in `subsample_bam`.
12+
### Fixed
13+
- `subsample_bam` previously subsampled proportionally before filtering, resulting in lower-than-expected depth.
14+
715
## [v0.3.13] - 2023-06-23
816
### Changed
917
- `subsample_bam`: `--force_low_coverage` saves contigs with coverage below the target

pomoxis/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = '0.3.13'
1+
__version__ = '0.3.14'
22

33
import argparse
44
import os

pomoxis/coverage_from_bam.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
import argparse
2+
import functools
23
import logging
3-
import numpy as np
44
import os
5+
6+
import numpy as np
57
import pandas as pd
68
import pysam
7-
from pomoxis.util import parse_regions, Region
89

10+
from pomoxis.util import filter_args, filter_read, parse_regions, Region
911

10-
def coverage_of_region(region, bam_fp, stride, primary_only=False):
12+
13+
def coverage_of_region(region, bam_fp, stride, read_filter_func):
1114
"""Get coverage data for a region"""
1215

1316
bins = np.arange(region.start, region.end, stride)
@@ -16,7 +19,7 @@ def coverage_of_region(region, bam_fp, stride, primary_only=False):
1619
coverage_by_is_rev = {True: np.zeros(len(bins)), False: np.zeros(len(bins))}
1720
for r_obj in pysam.AlignmentFile(bam_fp).fetch(contig=region.ref_name, start=region.start, end=region.end):
1821
# Ignore secondary/supplementary maps when computing the median depth, if requested
19-
if primary_only and (r_obj.is_secondary or r_obj.is_supplementary):
22+
if read_filter_func(r_obj):
2023
continue
2124
start_i = max((r_obj.reference_start - bins[0]) // stride, 0)
2225
end_i = min((r_obj.reference_end - bins[0]) // stride, len(bins))
@@ -39,24 +42,25 @@ def coverage_summary_of_region(*args, **kwargs):
3942

4043
def main():
4144
logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s', datefmt='%H:%M:%S', level=logging.INFO)
45+
4246
parser = argparse.ArgumentParser(
4347
prog='coverage_from_bam',
4448
description='Calculate read coverage depth from a bam.',
4549
epilog='By default a file is written per reference sequence, this can be changed with the `--one_file` '
4650
'option. If overlapping regions are specified, `--one_file` should not be used.',
51+
parents=[filter_args()],
4752
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
4853
parser.add_argument('bam', help='.fasta/fastq file.')
4954
parser.add_argument('-r', '--regions', nargs='+', help='Only process given regions.')
5055
grp = parser.add_mutually_exclusive_group()
5156
grp.add_argument('-p', '--prefix', help='Prefix for output, defaults to basename of bam.')
5257
grp.add_argument('-o', '--one_file', help='Single output file with "region" column.')
5358
parser.add_argument('-s', '--stride', type=int, default=1000, help='Stride in genomic coordinate.')
54-
parser.add_argument('--primary_only', action='store_true',
55-
help='Use only primary reads when computing the coverage.')
5659
parser.add_argument('--summary_only', action='store_true',
5760
help='Output only the depth_summary.txt file')
5861

5962
args = parser.parse_args()
63+
read_filter_func = functools.partial(filter_read, args=args)
6064

6165
bam = pysam.AlignmentFile(args.bam)
6266
ref_lengths = dict(zip(bam.references, bam.lengths))
@@ -76,7 +80,7 @@ def main():
7680
prefix = os.path.splitext(os.path.basename(args.bam))[0]
7781

7882
region_str = '{}_{}_{}'.format(region.ref_name, region.start, region.end)
79-
df = coverage_of_region(region, args.bam, args.stride, primary_only=args.primary_only)
83+
df = coverage_of_region(region, args.bam, args.stride, read_filter_func)
8084
summary[region_str] = df['depth'].describe()
8185

8286
if not args.summary_only:

pomoxis/filter_bam.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import argparse
2+
import logging
3+
4+
import pysam
5+
6+
from pomoxis.util import filter_args, filter_read, parse_regions, Region
7+
8+
9+
def main():
    """Filter a bam, writing reads that pass the filters to a new bam.

    Reads are kept or discarded by ``pomoxis.util.filter_read`` using the
    common filtering options contributed by ``pomoxis.util.filter_args``.
    The output bam is indexed on completion.
    """
    logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s', datefmt='%H:%M:%S', level=logging.INFO)
    parser = argparse.ArgumentParser(
        prog='filter_bam',
        description='Filter a bam',
        parents=[filter_args()],
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('bam',
        help='input bam file.')
    parser.add_argument('-o', '--output_bam', default='filtered.bam',
        help='Output bam file.')
    parser.add_argument('-r', '--region',
        help='Only process given region.')

    args = parser.parse_args()
    logger = logging.getLogger('filter')

    with pysam.AlignmentFile(args.bam) as bam:
        if args.region is not None:
            # NOTE(review): fetch() on a region requires the input bam to be
            # coordinate-sorted and indexed — confirm callers guarantee this.
            ref_lengths = dict(zip(bam.references, bam.lengths))
            region = parse_regions([args.region], ref_lengths=ref_lengths)[0]
            reads = bam.fetch(region.ref_name, region.start, region.end)
        else:
            reads = bam
        # Context manager guarantees the output is flushed and closed even if
        # filtering raises part-way through (the original leaked the handle on error).
        with pysam.AlignmentFile(args.output_bam, "wb", header=bam.header) as out_bam:
            for read in reads:
                # filter_read is truthy when the read should be discarded
                if filter_read(read, args, logger):
                    continue
                out_bam.write(read)
    pysam.index(args.output_bam)

pomoxis/stats_from_bam.py

Lines changed: 3 additions & 148 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
import pysam
1414

15-
from pomoxis.util import intervaltrees_from_bed
15+
from pomoxis.util import intervaltrees_from_bed, masked_stats_from_aligned_read, stats_from_aligned_read
1616

1717
parser = argparse.ArgumentParser(
1818
prog='stats_from_bam',
@@ -32,151 +32,6 @@
3232
help='Number of threads for parallel processing.')
3333

3434

35-
def stats_from_aligned_read(read, references, lengths):
    """Create summary information for an aligned read.

    :param read: :class:`pysam.AlignedSegment` object.
    :param references: sequence of reference names, indexed by
        ``read.reference_id``.
    :param lengths: sequence of reference lengths, parallel to ``references``.

    :returns: tuple of (dict of alignment stats, bool flagging a likely
        lra-produced alignment).
    :raises IOError: if the read lacks the required 'NM' tag.
    """
    try:
        NM = read.get_tag('NM')
    except KeyError as e:
        # get_tag raises KeyError for a missing tag; the original bare
        # `except:` also masked unrelated errors (even KeyboardInterrupt).
        raise IOError("Read is missing required 'NM' tag. Try running 'samtools fillmd -S - ref.fa'.") from e

    name = read.qname
    start_offset = 0
    if read.is_secondary or read.is_supplementary:
        # hard-clipped prefix shifts query coordinates; account for it
        first_cig = read.cigartuples[0]
        if first_cig[0] == 5:  # hard clip; should always be true for minimap2
            start_offset = first_cig[1]
    counts, _ = read.get_cigar_stats()
    match = counts[0] + counts[7] + counts[8]  # total of M, =, and X
    ins = counts[1]
    delt = counts[2]

    lra_flag = False
    if read.has_tag('NX'):
        # likely from lra
        # NM is number of matches, see https://github.com/ChaissonLab/LRA/issues/32
        sub = counts[8]
        lra_flag = True
    else:
        # likely from minimap2
        # NM is edit distance: NM = INS + DEL + SUB
        sub = NM - ins - delt

    length = match + ins + delt
    # NOTE(review): a CIGAR with no M/=/X ops would divide by zero here —
    # presumably upstream filtering excludes such alignments; confirm.
    iden = 100 * float(match - sub) / match
    acc = 100 * float(match - sub) / length

    read_length = read.infer_read_length()
    coverage = 100 * float(read.query_alignment_length) / read_length
    direction = '-' if read.is_reverse else '+'

    results = {
        "name": name,
        "coverage": coverage,
        "direction": direction,
        "length": length,
        "read_length": read_length,
        "match": match,
        "ins": ins,
        "del": delt,
        "sub": sub,
        "iden": iden,
        "acc": acc,
        "qstart": read.query_alignment_start + start_offset,
        "qend": read.query_alignment_end + start_offset,
        "rstart": read.reference_start,
        "rend": read.reference_end,
        "ref": references[read.reference_id],
        "aligned_ref_len": read.reference_length,
        "ref_coverage": 100 * float(read.reference_length) / lengths[read.reference_id],
        "mapq": read.mapping_quality,
    }

    return results, lra_flag
98-
99-
100-
def masked_stats_from_aligned_read(read, references, lengths, tree):
    """Create summary information for an aligned read over regions in bed file.

    :param read: :class:`pysam.AlignedSegment` object.
    :param references: sequence of reference names, indexed by
        ``read.reference_id``.
    :param lengths: sequence of reference lengths, parallel to ``references``.
    :param tree: interval tree (exposing ``has_overlap``) of the unmasked
        reference regions.

    :returns: dict of alignment stats, or None if no matches fall within the
        bed regions.
    :raises IOError: if the read lacks the required 'MD' tag.
    """
    try:
        # MD is required for get_aligned_pairs(with_seq=True) below;
        # the value itself is not used.
        read.get_tag('MD')
    except KeyError as e:
        # get_tag raises KeyError for a missing tag; the original bare
        # `except:` also masked unrelated errors (even KeyboardInterrupt).
        raise IOError("Read is missing required 'MD' tag. Try running 'samtools callmd - ref.fa'.") from e

    correct, delt, ins, sub, aligned_ref_len, masked = 0, 0, 0, 0, 0, 0
    pairs = read.get_aligned_pairs(with_seq=True)
    qseq = read.query_sequence
    pos_is_none = lambda x: (x[1] is None or x[0] is None)
    pos = None
    # TODO: refactor to use get_trimmed_pairs (as in catalogue_errors)?
    for qp, rp, rb in itertools.dropwhile(pos_is_none, pairs):
        if rp == read.reference_end or (qp == read.query_alignment_end):
            break
        pos = rp if rp is not None else pos
        if not tree.has_overlap(pos, pos + 1) or (rp is None and not tree.has_overlap(pos + 1, pos + 2)):
            # if rp is None, we are in an insertion, check if pos + 1 overlaps
            # (ref position of ins is arbitrary)
            masked += 1
            continue
        else:
            if rp is not None:  # we don't have an insertion
                aligned_ref_len += 1
                if qp is None:  # deletion
                    delt += 1
                elif qseq[qp] == rb:  # correct
                    correct += 1
                elif qseq[qp] != rb:  # sub
                    sub += 1
            else:  # we have an insertion
                ins += 1

    name = read.qname
    match = correct + sub
    length = match + ins + delt

    if match == 0:
        # no matches within bed regions - all bed ref positions were deleted.
        # skip this alignment.
        return None

    iden = 100 * float(match - sub) / match
    acc = 100 - 100 * float(sub + ins + delt) / length

    read_length = read.infer_read_length()
    masked_query_alignment_length = correct + sub + ins
    coverage = 100 * float(masked_query_alignment_length) / read_length
    direction = '-' if read.is_reverse else '+'

    results = {
        "name": name,
        "coverage": coverage,
        "direction": direction,
        "length": length,
        "read_length": read_length,
        "match": match,
        "ins": ins,
        "del": delt,
        "sub": sub,
        "iden": iden,
        "acc": acc,
        "qstart": read.query_alignment_start,
        "qend": read.query_alignment_end,
        "rstart": read.reference_start,
        "rend": read.reference_end,
        "ref": references[read.reference_id],
        "aligned_ref_len": aligned_ref_len,
        "ref_coverage": 100 * float(aligned_ref_len) / lengths[read.reference_id],
        "mapq": read.mapping_quality,
        "masked": masked,
    }
    return results
179-
18035

18136
def _process_reads(bam_fp, start_stop, all_alignments=False, min_length=None, bed_file=None):
18237
start, stop = start_stop
@@ -205,12 +60,12 @@ def _process_reads(bam_fp, start_stop, all_alignments=False, min_length=None, be
20560
counts['masked'] += 1
20661
continue
20762
else:
208-
result = masked_stats_from_aligned_read(read, bam.references, bam.lengths, trees[read.reference_name])
63+
result = masked_stats_from_aligned_read(read, trees[read.reference_name])
20964
if result is None: # no matches within bed regions
21065
counts['all_matches_masked'] += 1
21166
continue
21267
else:
213-
result, lra_flag = stats_from_aligned_read(read, bam.references, bam.lengths)
68+
result, lra_flag = stats_from_aligned_read(read)
21469

21570
if min_length is not None and result['length'] < min_length:
21671
counts['short'] += 1

0 commit comments

Comments
 (0)