-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNeedleman_Wunsch.py
More file actions
161 lines (119 loc) · 4.74 KB
/
Needleman_Wunsch.py
File metadata and controls
161 lines (119 loc) · 4.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
from nis import match
from tempfile import tempdir
from cv2 import ml_TrainData
import numpy as np
import itertools
'''
ON ALIGNMENT:
Global Seqnce alignment:
Assumes two sequences are homologous (share common evolutionary ancestor)
across their whole length. Tries to align the whole genome from tail to nose.
Assumption for two genes in the same gene family
Local Alignment:
Assumes that there are regions of homology within a bigger sequence,
but also regions that do not share common ancestry.
Reasonable assumption if checking how many genes two species share.
If the species aren't extremely closely related, we expect shared genes and
large portions of genome that's not shared.
'''
'''
LOGIC:
1) fill in matrix:
horizontal movements: gap penalty
diagonal movements: match / mismatch
store all 3 possible values (dynamic programming) and take the maximum as final value
2) Backtracing:
From bottom of matrix where highest score is, continue up to the upper left conner
If the bses match, go up diagonally, if not, go to the highest value neighbor
'''
def score_matrix_v0(seq, ref, match_score, gap_cost):
'''
(v0): this funciton builds a score matrix using dynamic programming
:param seq: user defined sequence
:param ref: user defined reference sequence
:param match_socre: user defined match score
:param gap_cost: user defined gap penalty
'''
# initialize matrix
matrix = np.zeros((5,6))
matrix[0,1:] = [i for i in range(gap_cost,((len(ref)+1)*gap_cost),gap_cost)]
matrix[1:,0] = [i for i in range(gap_cost,((len(seq)+1)*gap_cost),gap_cost)]
for i in range(1, len(seq)+1): # +1 here bc matrix has one additional cell (0)
possible_scores = 0
for j in range(1, len(ref)+1):
gap_j = matrix[i-1,j] + gap_cost # (i-1,j): left
gap_i = matrix[i,j-1] + gap_cost # (i,j-1):up
possible_scores = max(gap_j, gap_i)
if seq[i-1] == ref[j-1]: # if match
diag = matrix[i-1,j-1] + match_score
else: # if mismatch
diag = matrix[i-1,j-1] - match_score
if diag > possible_scores:
possible_scores = diag
matrix[i,j] = possible_scores
return matrix
def score_matrix_v1(seq, ref, match_score, gap_cost):
'''
(v1) this function builds a score matrix using itertools and numpy (more space efficient)
:param seq:
:param ref:
:param match_socre:
:param gap_cost:
'''
matrix = np.zeros((len(seq) + 1, len(ref) + 1))
matrix[0,:] = np.linspace(0, (len(ref)*gap_cost),len(ref)+1)
matrix[:,0] = np.linspace(0, (len(seq)*gap_cost),len(seq)+1)
for i, j in itertools.product(range(1, matrix.shape[0]), range(1, matrix.shape[1])):
match = matrix[i-1,j-1] + (match_score if seq[i-1] == ref[j-1] else - match_score)
deletion = matrix[i-1,j] + gap_cost
insertion = matrix[i, j-1] + gap_cost
matrix[i,j] = max(match, deletion, insertion)
return matrix
def needleman_wunsch(matrix):
'''
This function backtraces the score matrix and returns alignment result.
LOGIC:
1) go to the last filled value of the matrix
2) breaks down socre matrix into 2x2 sub-matrices, find largest neighboring value or move diagonally
3) append base to sequence to reconstruct alignment
4) returns alignment (flips strings since backtracing)
:param matrix: score matrix constructed
'''
# for indexing: since seq and ref are one short than matrix dim
(m,n) = np.subtract(matrix.shape, (1,1))
# store alignment info
align_seq = ''
align_ref = ''
# scores and directions
diagonal = (1,1)
left = (0,1)
up = (1,0)
while m>0 and n>0:
# look at 2x2 matrix
m_temp = matrix[m-1:m+1,n-1:n+1]
# if sequences match, go back diagonally
if seq[m-1] == ref[n-1]:
align_seq += seq[m-1]
align_ref += ref[n-1]
(m, n) = np.subtract((m,n), diagonal)
else:
# find max value neighbors (3 neighbor values)
if np.argsort(m_temp.flatten()[:-1])[-1] == 1:
align_ref += '-'
align_seq += ref[n-1]
(m,n) = np.subtract((m,n), up)
elif np.argsort(m_temp.flatten()[:-1])[-1] == 2:
align_seq += '-'
align_ref += ref[n-1]
(m,n) = np.subtract((m,n), left)
return (align_ref[::-1], align_seq[::-1])
if __name__ == "__main__":
match_score = 1
gap_cost = -2
seq = 'AGCT'
ref = 'ATGCT'
matrix = score_matrix_v1(seq, ref, match_score, gap_cost)
align_result = needleman_wunsch(matrix)
print("alignment result:")
print("ref:", align_result[0])
print("seq:", align_result[1])