-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathphase.py
More file actions
71 lines (65 loc) · 3.15 KB
/
phase.py
File metadata and controls
71 lines (65 loc) · 3.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import sys
import copy
import pysam
# input: sorted BAM + phased VCF + precontact.py output
# output: read name + a list of segments (is read 2, query start, query end, ref name, ref start, ref end, strand, haplotype: 0=left, 1=right, -1=unknown, -2=disagree)
# read IO locations from arguments
inputBamFile=sys.argv[1]
inputVcfFile=sys.argv[2]
inputFile=open(sys.argv[3],"r")
outputFile=open(sys.argv[4],"w")
# open BAM files
samfile = pysam.AlignmentFile(inputBamFile, "rb")
# load Meta-C data into memory
inputData = {}
for inputLine in inputFile:
inputLineData = inputLine.strip().split("\t")
inputData[inputLineData[0]] = []
for inputAlignment in inputLineData[1:]:
inputAlignmentData = inputAlignment.split(",")
inputAlignmentData.append("-1")
inputData[inputLineData[0]].append(inputAlignmentData)
print("finished loading Meta-C data")
# open VCF file
bcf_in = pysam.VariantFile(inputVcfFile)
vcfSample = bcf_in.header.samples[0]
for rec in bcf_in.fetch():
vcfGenotype = rec.samples[vcfSample]["GT"]
vcfAlleles = rec.alleles
# keep only het biallelic SNPs
if len(vcfGenotype) != 2 or vcfGenotype[0] + vcfGenotype[1] != 1 or len(vcfAlleles[0]) != 1 or len(vcfAlleles[1]) != 1:
continue
# find nucleotide for each haplotype
leftNucleotide = vcfAlleles[vcfGenotype[0]]
rightNucleotide = vcfAlleles[vcfGenotype[1]]
vcfChr = rec.chrom[3:]
for pileupcolumn in samfile.pileup(vcfChr, rec.pos-1, rec.pos): # here we remove "chr" from VCF's chromosome name
if pileupcolumn.pos == rec.pos - 1: # find the position of the SNP
for pileupread in pileupcolumn.pileups:
if not pileupread.is_del and not pileupread.is_refskip:
currentName = pileupread.alignment.query_name
currentNucleotide = pileupread.alignment.query_sequence[pileupread.query_position]
currentHaplotype = "-1"
if currentNucleotide == leftNucleotide:
currentHaplotype = "0"
elif currentNucleotide == rightNucleotide:
currentHaplotype = "1"
if currentHaplotype == "-1":
continue
if not currentName in inputData:
continue
inputReadData = inputData[currentName]
for inputAlignmentData in inputReadData:
if inputAlignmentData[3] == vcfChr and rec.pos - 1 >= int(inputAlignmentData[4]) and rec.pos <= int(inputAlignmentData[5]):
if inputAlignmentData[-1] == "-1":
inputAlignmentData[-1] = currentHaplotype
elif inputAlignmentData[-1] != currentHaplotype:
inputAlignmentData[-1] = "-2"
print("finished phasing Meta-C data")
# print output
for inputRead in inputData:
outputFile.write(inputRead)
for inputAlignmentData in inputData[inputRead]:
outputFile.write('\t'+",".join(inputAlignmentDataField for inputAlignmentDataField in inputAlignmentData))
outputFile.write("\n")
print("finished writing output")