-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclean_junk.py
More file actions
90 lines (80 loc) · 3.84 KB
/
clean_junk.py
File metadata and controls
90 lines (80 loc) · 3.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import sys
import copy
import math
import bisect
# input:
# output:
# read IO locations from arguments
inputFile=open(sys.argv[1],"r")
outputFile=open(sys.argv[2],"w")
# parameters
cleanDistance = 10e6
sqrtCleanDistance = math.sqrt(cleanDistance)
#cleanCounts = 3 # for Nagano
#cleanCounts = 10 # for Meta-C
cleanCounts = 5 # for Meta-C blood
#cleanCounts = 2 # for Meta-C sperm shallow
legDistance = 500
legCounts = 10
# load contact data
contactData = {}
legData = {}
for inputLine in inputFile:
inputLineData = inputLine.strip().split("\t")
for i in [1,2,4,5]:
inputLineData[i] = int(inputLineData[i])
# add to contact list
if not inputLineData[0] in contactData:
contactData[inputLineData[0]] = {}
if not inputLineData[3] in contactData[inputLineData[0]]:
contactData[inputLineData[0]][inputLineData[3]] = []
contactData[inputLineData[0]][inputLineData[3]].append(inputLineData)
# add to leg list
if not inputLineData[0] in legData:
legData[inputLineData[0]] = []
if not inputLineData[3] in legData:
legData[inputLineData[3]] = []
legData[inputLineData[0]].append(inputLineData[1])
legData[inputLineData[3]].append(inputLineData[4])
# sort data
for leftChr in contactData:
for rightChr in contactData[leftChr]:
contactData[leftChr][rightChr] = sorted(contactData[leftChr][rightChr])
for legChr in legData:
legData[legChr] = sorted(legData[legChr])
# remove bad legs by counting identical legs
tempContactData = {}
for leftChr in contactData:
tempContactData[leftChr] = {}
for rightChr in contactData[leftChr]:
tempContactData[leftChr][rightChr] = []
print(f"removing {leftChr}, {rightChr}")
for contact in contactData[leftChr][rightChr]:
numOfLeftLegs = bisect.bisect_right(legData[leftChr][bisect.bisect_left(legData[leftChr],contact[1] - legDistance):],contact[1] + legDistance)
numOfRightLegs = bisect.bisect_right(legData[rightChr][bisect.bisect_left(legData[rightChr],contact[4] - legDistance):],contact[4] + legDistance)
if numOfLeftLegs <= legCounts and numOfRightLegs <= legCounts:
tempContactData[leftChr][rightChr].append(contact)
contactData = tempContactData
# clean data
for leftChr in contactData:
for rightChr in contactData[leftChr]:
print(f"cleaning {leftChr}, {rightChr}")
numOfContacts = len(contactData[leftChr][rightChr])
for i in range(numOfContacts):
nearbyCounts = 0
for j in range(i+1, numOfContacts):
if nearbyCounts >= cleanCounts:
break
if contactData[leftChr][rightChr][j][1] - contactData[leftChr][rightChr][i][1] > cleanDistance:
break
if math.sqrt(contactData[leftChr][rightChr][j][1] - contactData[leftChr][rightChr][i][1]) + math.sqrt(abs(contactData[leftChr][rightChr][j][4] - contactData[leftChr][rightChr][i][4])) <= sqrtCleanDistance:
nearbyCounts += 1
for j in range(0, i):
if nearbyCounts >= cleanCounts:
break
if contactData[leftChr][rightChr][i][1] - contactData[leftChr][rightChr][j][1] > cleanDistance:
break
if math.sqrt(contactData[leftChr][rightChr][i][1] - contactData[leftChr][rightChr][j][1]) + math.sqrt(abs(contactData[leftChr][rightChr][j][4] - contactData[leftChr][rightChr][i][4])) <= sqrtCleanDistance:
nearbyCounts += 1
if nearbyCounts < cleanCounts:
outputFile.write(contactData[leftChr][rightChr][i][0]+'\t'+str(contactData[leftChr][rightChr][i][1])+'\t'+str(contactData[leftChr][rightChr][i][2])+'\t'+contactData[leftChr][rightChr][i][3]+'\t'+str(contactData[leftChr][rightChr][i][4])+'\t'+str(contactData[leftChr][rightChr][i][5])+'\n')