|
12 | 12 |
|
13 | 13 | import pysam |
14 | 14 |
|
15 | | -from pomoxis.util import intervaltrees_from_bed |
| 15 | +from pomoxis.util import intervaltrees_from_bed, masked_stats_from_aligned_read, stats_from_aligned_read |
16 | 16 |
|
17 | 17 | parser = argparse.ArgumentParser( |
18 | 18 | prog='stats_from_bam', |
|
32 | 32 | help='Number of threads for parallel processing.') |
33 | 33 |
|
34 | 34 |
|
35 | | -def stats_from_aligned_read(read, references, lengths): |
36 | | - """Create summary information for an aligned read |
37 | | -
|
38 | | - :param read: :class:`pysam.AlignedSegment` object |
39 | | - """ |
40 | | - try: |
41 | | - NM = read.get_tag('NM') |
42 | | - except: |
43 | | - raise IOError("Read is missing required 'NM' tag. Try running 'samtools fillmd -S - ref.fa'.") |
44 | | - |
45 | | - name = read.qname |
46 | | - start_offset = 0 |
47 | | - if read.is_secondary or read.is_supplementary: |
48 | | - first_cig = read.cigartuples[0] |
49 | | - if first_cig[0] == 5: # should always be true for minimap2 |
50 | | - start_offset = first_cig[1] |
51 | | - counts, _ = read.get_cigar_stats() |
52 | | - match = counts[0] + counts[7] + counts[8] # total of M, =, and X |
53 | | - ins = counts[1] |
54 | | - delt = counts[2] |
55 | | - |
56 | | - lra_flag = False |
57 | | - if read.has_tag('NX'): |
58 | | - # likely from lra |
59 | | - # NM is number of matches, see https://github.com/ChaissonLab/LRA/issues/32 |
60 | | - sub = counts[8] |
61 | | - lra_flag = True |
62 | | - else: |
63 | | - # likely from minimap2 |
64 | | - # NM is edit distance: NM = INS + DEL + SUB |
65 | | - sub = NM - ins - delt |
66 | | - |
67 | | - length = match + ins + delt |
68 | | - iden = 100 * float(match - sub) / match |
69 | | - acc = 100 * float(match - sub) / length |
70 | | - |
71 | | - read_length = read.infer_read_length() |
72 | | - coverage = 100 * float(read.query_alignment_length) / read_length |
73 | | - direction = '-' if read.is_reverse else '+' |
74 | | - |
75 | | - results = { |
76 | | - "name": name, |
77 | | - "coverage": coverage, |
78 | | - "direction": direction, |
79 | | - "length": length, |
80 | | - "read_length": read_length, |
81 | | - "match": match, |
82 | | - "ins": ins, |
83 | | - "del": delt, |
84 | | - "sub": sub, |
85 | | - "iden": iden, |
86 | | - "acc": acc, |
87 | | - "qstart": read.query_alignment_start + start_offset, |
88 | | - "qend": read.query_alignment_end + start_offset, |
89 | | - "rstart": read.reference_start, |
90 | | - "rend": read.reference_end, |
91 | | - "ref": references[read.reference_id], |
92 | | - "aligned_ref_len": read.reference_length, |
93 | | - "ref_coverage": 100*float(read.reference_length) / lengths[read.reference_id], |
94 | | - "mapq": read.mapping_quality, |
95 | | - } |
96 | | - |
97 | | - return results, lra_flag |
98 | | - |
99 | | - |
100 | | -def masked_stats_from_aligned_read(read, references, lengths, tree): |
101 | | - """Create summary information for an aligned read over regions in bed file. |
102 | | -
|
103 | | - :param read: :class:`pysam.AlignedSegment` object |
104 | | - """ |
105 | | - try: |
106 | | - MD = read.get_tag('MD') |
107 | | - except: |
108 | | - raise IOError("Read is missing required 'MD' tag. Try running 'samtools callmd - ref.fa'.") |
109 | | - |
110 | | - correct, delt, ins, sub, aligned_ref_len, masked = 0, 0, 0, 0, 0, 0 |
111 | | - pairs = read.get_aligned_pairs(with_seq=True) |
112 | | - qseq = read.query_sequence |
113 | | - pos_is_none = lambda x: (x[1] is None or x[0] is None) |
114 | | - pos = None |
115 | | - insertions = [] |
116 | | - # TODO: refactor to use get_trimmed_pairs (as in catalogue_errors)? |
117 | | - for qp, rp, rb in itertools.dropwhile(pos_is_none, pairs): |
118 | | - if rp == read.reference_end or (qp == read.query_alignment_end): |
119 | | - break |
120 | | - pos = rp if rp is not None else pos |
121 | | - if not tree.has_overlap(pos, pos + 1) or (rp is None and not tree.has_overlap(pos + 1, pos + 2)): |
122 | | - # if rp is None, we are in an insertion, check if pos + 1 overlaps |
123 | | - # (ref position of ins is arbitrary) |
124 | | - # print('Skipping ref {}:{}'.format(read.reference_name, pos)) |
125 | | - masked += 1 |
126 | | - continue |
127 | | - else: |
128 | | - if rp is not None: # we don't have an insertion |
129 | | - aligned_ref_len += 1 |
130 | | - if qp is None: # deletion |
131 | | - delt += 1 |
132 | | - elif qseq[qp] == rb: # correct |
133 | | - correct += 1 |
134 | | - elif qseq[qp] != rb: # sub |
135 | | - sub += 1 |
136 | | - else: # we have an insertion |
137 | | - ins += 1 |
138 | | - |
139 | | - name = read.qname |
140 | | - match = correct + sub |
141 | | - length = match + ins + delt |
142 | | - |
143 | | - if match == 0: |
144 | | - # no matches within bed regions - all bed ref positions were deleted. |
145 | | - # skip this alignment. |
146 | | - return None |
147 | | - |
148 | | - iden = 100 * float(match - sub) / match |
149 | | - acc = 100 - 100 * float(sub + ins + delt) / length |
150 | | - |
151 | | - read_length = read.infer_read_length() |
152 | | - masked_query_alignment_length = correct + sub + ins |
153 | | - coverage = 100*float(masked_query_alignment_length) / read_length |
154 | | - direction = '-' if read.is_reverse else '+' |
155 | | - |
156 | | - results = { |
157 | | - "name": name, |
158 | | - "coverage": coverage, |
159 | | - "direction": direction, |
160 | | - "length": length, |
161 | | - "read_length": read_length, |
162 | | - "match": match, |
163 | | - "ins": ins, |
164 | | - "del": delt, |
165 | | - "sub": sub, |
166 | | - "iden": iden, |
167 | | - "acc": acc, |
168 | | - "qstart": read.query_alignment_start, |
169 | | - "qend": read.query_alignment_end, |
170 | | - "rstart": read.reference_start, |
171 | | - "rend": read.reference_end, |
172 | | - "ref": references[read.reference_id], |
173 | | - "aligned_ref_len": aligned_ref_len, |
174 | | - "ref_coverage": 100 * float(aligned_ref_len) / lengths[read.reference_id], |
175 | | - "mapq": read.mapping_quality, |
176 | | - "masked": masked, |
177 | | - } |
178 | | - return results |
179 | | - |
180 | 35 |
|
181 | 36 | def _process_reads(bam_fp, start_stop, all_alignments=False, min_length=None, bed_file=None): |
182 | 37 | start, stop = start_stop |
@@ -205,12 +60,12 @@ def _process_reads(bam_fp, start_stop, all_alignments=False, min_length=None, be |
205 | 60 | counts['masked'] += 1 |
206 | 61 | continue |
207 | 62 | else: |
208 | | - result = masked_stats_from_aligned_read(read, bam.references, bam.lengths, trees[read.reference_name]) |
| 63 | + result = masked_stats_from_aligned_read(read, trees[read.reference_name]) |
209 | 64 | if result is None: # no matches within bed regions |
210 | 65 | counts['all_matches_masked'] += 1 |
211 | 66 | continue |
212 | 67 | else: |
213 | | - result, lra_flag = stats_from_aligned_read(read, bam.references, bam.lengths) |
| 68 | + result, lra_flag = stats_from_aligned_read(read) |
214 | 69 |
|
215 | 70 | if min_length is not None and result['length'] < min_length: |
216 | 71 | counts['short'] += 1 |
|
0 commit comments