dip-c/seg.py at master · tanlongzhi/dip-c · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import sys
import getopt
import pysam
import gzip
from classes import Haplotypes, Seg, Read, SegData

# add the primary alignment as a segment to a read, if satisfying conditions
def add_primary_seg(read, bam_read, min_mapq, max_nm_per_bp):
    if bam_read.mapping_quality < min_mapq or float(bam_read.get_tag("NM")) / (bam_read.query_alignment_end - bam_read.query_alignment_start) > max_nm_per_bp:
        return
    read.add_seg(Seg(bam_read.is_read2, bam_read.query_alignment_start, bam_read.query_alignment_end, bam_read.reference_name, bam_read.reference_start, bam_read.reference_end, bam_read.is_reverse))

# add all SA alignments as segments to a read, if satisfying conditions
def add_sa_segs(read, bam_read, min_mapq, max_nm_per_bp):
    sa_tags = bam_read.get_tag("SA")
    # read each part of the SA tag
    for sa_tag in sa_tags.split(";")[:-1]:
        ref_name, ref_start, strand, cigar, mapping_quality, edit_distance = sa_tag.split(",")
        mapping_quality = int(mapping_quality)
        edit_distance = int(edit_distance)
        ref_start = int(ref_start)
        if mapping_quality < min_mapq:
            continue
        # hack pysam's current alignment (primary) to get SA alignment
        bam_read.reference_start = int(ref_start)
        bam_read.cigarstring = cigar
        if bam_read.is_reverse == (strand == "-"):
            query_start = bam_read.query_alignment_start
            query_end = bam_read.query_alignment_end
        else:
            query_start = bam_read.query_length - bam_read.query_alignment_end
            query_end = bam_read.query_length - bam_read.query_alignment_start
        if float(edit_distance) / (query_end - query_start) > max_nm_per_bp:
            continue
        read.add_seg(Seg(bam_read.is_read2, query_start, query_end, ref_name, ref_start, bam_read.reference_end, True if strand == "-" else False))

def seg(argv):
    # default parameters
    min_mapq = 20
    max_nm_per_bp = 0.05
    min_baseq = 20
    snp_file_name = None

    # progress display parameters
    display_num_bam_reads = 1e5
    display_num_snps = 1e4

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "q:m:v:")
    except getopt.GetoptError as err:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if len(args) == 0:
        sys.stderr.write("Usage: dip-c seg [options] <in.bam>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write("  -q INT          min mapping quality [" + str(min_mapq) + "]\n")
        sys.stderr.write("  -m FLOAT        max edit distance per bp of aligned read [" + str(max_nm_per_bp) + "]\n")
        sys.stderr.write("  -v <snps.txt>   SNP file:\n")
        sys.stderr.write("                    tab-delimited: chr, pos, paternal nucleotide, maternal nucleotide\n")
        sys.stderr.write("  -Q              min base quality (if -v) [" + str(min_baseq) + "]\n")
        return 1
    for o, a in opts:
        if o == "-q":
            min_mapq = int(a)
        elif o == "-m":
            max_nm_per_bp = float(a)
        elif o == "-v":
            snp_file_name = a

    # initialize data structure
    seg_data = SegData()

    # pass 1: find reads with SA tags and/or discordant reads
    bam_file = pysam.AlignmentFile(args[0], "rb")
    num_bam_reads = 0
    for bam_read in bam_file.fetch(until_eof=True):
        num_bam_reads += 1
        if num_bam_reads % display_num_bam_reads == 0:
            sys.stderr.write("[M::" + __name__ + "] pass 1: read " + str(num_bam_reads) + " alignments, last at " + (bam_read.reference_name + ":" + str(bam_read.reference_start) if not bam_read.is_unmapped else "*") + "; added " + str(seg_data.num_reads()) + " candidate reads\n")

        # exclude bad reads
        if bam_read.is_unmapped or bam_read.is_supplementary or bam_read.is_duplicate or bam_read.is_qcfail or bam_read.is_secondary:
            continue

        # get segments from reads
        if bam_read.has_tag("SA"):
            read = Read(bam_read.query_name)
            add_primary_seg(read, bam_read, min_mapq, max_nm_per_bp)
            add_sa_segs(read, bam_read, min_mapq, max_nm_per_bp)
        elif bam_read.is_paired and not bam_read.is_proper_pair:
            read = Read(bam_read.query_name)
            add_primary_seg(read, bam_read, min_mapq, max_nm_per_bp)
        else:
            continue
        seg_data.add_read(read)
    sys.stderr.write("[M::" + __name__ + "] pass 1 done: read " + str(num_bam_reads) + " alignments; added " + str(seg_data.num_reads()) + " candidate reads (" + str(round(100.0 * seg_data.num_reads() / num_bam_reads, 2)) + "% of alignments)\n")

    # pass 2: find mates of all reads from pass 1, and then removes reads with only one segment
    num_bam_reads = 0
    bam_file.reset() # not a documented API, but works
    for bam_read in bam_file.fetch(until_eof=True):
        num_bam_reads += 1
        if num_bam_reads % display_num_bam_reads == 0:
            sys.stderr.write("[M::" + __name__ + "] pass 2: read " + str(num_bam_reads) + " alignments, last at " + (bam_read.reference_name + ":" + str(bam_read.reference_start) if not bam_read.is_unmapped else "*") + "\n")
        if bam_read.has_tag("SA") or bam_read.is_paired and not bam_read.is_proper_pair:
            continue # would have been added in the first round
        if seg_data.contains_read_name(bam_read.query_name):
            read = Read(bam_read.query_name)
            add_primary_seg(read, bam_read, min_mapq, max_nm_per_bp)
        else:
            continue
        seg_data.add_read(read)
    sys.stderr.write("[M::" + __name__ + "] pass 2: cleaning " + str(seg_data.num_reads()) + " candidate reads\n")
    seg_data.clean()
    sys.stderr.write("[M::" + __name__ + "] pass 2 done: read " + str(num_bam_reads) + " alignments; kept " + str(seg_data.num_reads()) + " candidate reads (" + str(round(100.0 * seg_data.num_reads() / num_bam_reads, 2)) + "% of alignments)\n")

    # pass 3: (if -v) find haplotype information based on the SNP file
    if not snp_file_name is None:
        num_snps = 0
        snp_file = gzip.open(snp_file_name, "rb") if snp_file_name.endswith(".gz") else open(snp_file_name, "rb")
        for snp_file_line in snp_file:
            num_snps += 1
            snp_chr, snp_locus, snp_pat, snp_mat = snp_file_line.strip().split()
            snp_locus = int(snp_locus)
            if num_snps % display_num_snps == 0:
                sys.stderr.write("[M::" + __name__ + "] pass 3: read " + str(num_snps) + " SNPs, last at " + snp_chr + ":" + str(snp_locus) + "\n")

            for pileup_column in bam_file.pileup(snp_chr, snp_locus - 1, snp_locus):
                if pileup_column.pos == snp_locus - 1:
                    for pileup_read in pileup_column.pileups:
                        if not pileup_read.is_del and not pileup_read.is_refskip:
                            # skip read if not a candidate
                            if not seg_data.contains_read_name(pileup_read.alignment.query_name):
                                continue
                            # discard if low base quality
                            if pileup_read.alignment.query_qualities[pileup_read.query_position] < min_baseq:
                                continue

                            # phase the corresponding segment
                            seg_haplotype = -1
                            if pileup_read.alignment.query_sequence[pileup_read.query_position] == snp_pat:
                                seg_haplotype = 0
                            elif pileup_read.alignment.query_sequence[pileup_read.query_position] == snp_mat:
                                seg_haplotype = 1
                            else:
                                continue
                            seg_data.update_haplotype(pileup_read.alignment.query_name, pileup_read.alignment.is_read2, snp_chr, snp_locus, seg_haplotype)
        sys.stderr.write("[M::" + __name__ + "] pass 3 done: read " + str(num_snps) + " SNPs\n")

    # write output
    sys.stderr.write("[M::" + __name__ + "] writing output for " + str(seg_data.num_reads()) + " candidate reads, containing " + str(seg_data.num_segs()) + " segments (" + str(round(100.0 * seg_data.num_phased_segs() / seg_data.num_segs(), 2)) + "% phased)\n")
    sys.stdout.write(seg_data.to_string()+"\n")

    return 0