-
Notifications
You must be signed in to change notification settings - Fork 24
Expand file tree
/
Copy pathbincon.py
More file actions
126 lines (114 loc) · 5.69 KB
/
bincon.py
File metadata and controls
126 lines (114 loc) · 5.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import sys
import gzip
import getopt
from classes import Haplotypes, string_to_leg, ref_name_haplotype_to_hom_name, ConData, file_to_con_data, Con, Leg
import numpy as np
from scipy import spatial
import math
def leg_to_matrix_index(leg, hom_offsets, matrix_bin_size, merge_haplotypes):
ref_name, haplotype = leg.ref_name_haplotype()
if merge_haplotypes:
# merge both haplotypes into "paternal"
haplotype = Haplotypes.paternal
hom_name = ref_name_haplotype_to_hom_name((ref_name, haplotype))
ref_locus = leg.get_ref_locus()
if hom_name not in hom_offsets:
return None
return hom_offsets[hom_name] + int(round(float(ref_locus) / matrix_bin_size))
def con_to_matrix_index(con, hom_offsets, matrix_bin_size, merge_haplotypes):
matrix_index_1 = leg_to_matrix_index(con.leg_1(), hom_offsets, matrix_bin_size, merge_haplotypes)
matrix_index_2 = leg_to_matrix_index(con.leg_2(), hom_offsets, matrix_bin_size, merge_haplotypes)
return matrix_index_1, matrix_index_2
def con_data_to_matrix(con_data, hom_offsets, matrix_bin_size, matrix_size, merge_haplotypes, display_num_cons):
# convert to matrix
sys.stderr.write("[M::" + __name__ + "] generating a " + str(matrix_size) + " x " + str(matrix_size) + " matrix\n")
matrix_data = np.zeros((matrix_size, matrix_size), dtype = int)
num_cons = 0
for con in con_data.get_cons():
num_cons += 1
if num_cons % display_num_cons == 0:
sys.stderr.write("[M::" + __name__ + "] processed " + str(num_cons) + " contacts\n")
matrix_index_1, matrix_index_2 = con_to_matrix_index(con, hom_offsets, matrix_bin_size, merge_haplotypes)
if matrix_index_1 is None or matrix_index_2 is None:
continue
matrix_data[matrix_index_1, matrix_index_2] += 1
matrix_data[matrix_index_2, matrix_index_1] += 1
return matrix_data
def bincon(argv):
# default parameters
chr_len_file_name = None
matrix_bin_size = 1000000
merge_haplotypes = False
info_mode = False
leg_mode = False
min_separation = 0
# progress display parameters
display_num_cons = 1e4
# read arguments
try:
opts, args = getopt.getopt(argv[1:], "l:b:HiLs:")
except getopt.GetoptError as err:
sys.stderr.write("[E::" + __name__ + "] unknown command\n")
return 1
if len(args) == 0:
sys.stderr.write("Usage: dip-c bincon [options] -l <chr.len> <in.con>\n")
sys.stderr.write("Options:\n")
sys.stderr.write(" -l <chr.len> file containing chromosome lengths (tab-delimited: chr, len)\n")
sys.stderr.write(" -L analyze LEG instead of CON\n")
sys.stderr.write(" -b INT bin size (bp) (bins are centered around multiples of bin size) [" + str(matrix_bin_size) + "]\n")
sys.stderr.write(" -H merge the two haplotypes\n")
sys.stderr.write(" -s INT min separation (bp) for intra-chromosomal contacts [" + str(min_separation) + "]\n")
sys.stderr.write(" -i output bin info (tab-delimited: homolog or chr if \"-H\", bin center) instead\n")
return 1
num_color_schemes = 0
for o, a in opts:
if o == "-l":
matrix_mode = True
chr_len_file_name = a
elif o == "-s":
min_separation = int(a)
elif o == "-b":
matrix_bin_size = int(a)
elif o == "-H":
merge_haplotypes = True
elif o == "-i":
info_mode = True
elif o == "-L":
leg_mode = True
if chr_len_file_name is None:
sys.stderr.write("[E::" + __name__ + "] -l is required\n")
return 1
# read chromosome lengths
hom_lens = {}
hom_bin_lens = {}
hom_offsets = {}
matrix_size = 0
chr_len_file = open(chr_len_file_name, "rb")
for chr_len_file_line in chr_len_file:
ref_name, ref_len = chr_len_file_line.strip().split("\t")
ref_len = int(ref_len)
for haplotype in ([Haplotypes.paternal] if merge_haplotypes else [Haplotypes.paternal, Haplotypes.maternal]):
hom_name = ref_name_haplotype_to_hom_name((ref_name, haplotype))
hom_bin_len = int(round(float(ref_len) / matrix_bin_size)) + 1
hom_lens[hom_name] = ref_len
hom_bin_lens[hom_name] = hom_bin_len
hom_offsets[hom_name] = matrix_size
matrix_size += hom_bin_len
if info_mode:
for bin_id in range(hom_bin_len):
sys.stdout.write("\t".join([(ref_name if merge_haplotypes else hom_name), str(bin_id * matrix_bin_size)]) + "\n")
# generate matrix
if not info_mode:
if leg_mode:
matrix_data = np.zeros((matrix_size, 1), dtype = int)
for leg_file_line in open(args[0], "rb"):
leg = string_to_leg(leg_file_line.strip())
matrix_data[leg_to_matrix_index(leg, hom_offsets, matrix_bin_size, merge_haplotypes)] += 1
else:
con_file = gzip.open(args[0], "rb") if args[0].endswith(".gz") else open(args[0], "rb")
con_data = file_to_con_data(con_file)
con_data.clean_separation(min_separation)
sys.stderr.write("[M::" + __name__ + "] read " + str(con_data.num_cons()) + " putative contacts (" + str(round(100.0 * con_data.num_intra_chr() / con_data.num_cons(), 2)) + "% intra-chromosomal, " + str(round(100.0 * con_data.num_phased_legs() / con_data.num_cons() / 2, 2)) + "% legs phased)\n")
matrix_data = con_data_to_matrix(con_data, hom_offsets, matrix_bin_size, matrix_size, merge_haplotypes, display_num_cons)
np.savetxt(sys.stdout, matrix_data, fmt='%i', delimiter='\t')
return 0