-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTextbook_BA2D.py
executable file
·70 lines (53 loc) · 2.47 KB
/
Textbook_BA2D.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python
'''
A solution to a code challenges that accompanies Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
The textbook is hosted on Stepic and the problem is listed on ROSALIND under the Textbook Track.
Problem Title: Greedy Motif Search
Chapter #: 02
Problem ID: D
URL: http://rosalind.info/problems/ba2d/
'''
from Textbook_02C import profile_most_probable_kmer
import os
def score(motifs):
'''Returns the score of the given list of motifs.'''
columns = [''.join(seq) for seq in zip(*motifs)]
max_count = sum([max([c.count(nucleotide) for nucleotide in 'ACGT']) for c in columns])
return len(motifs[0])*len(motifs) - max_count
def profile(motifs):
'''Returns the profile of the dna list motifs.'''
columns = [''.join(seq) for seq in zip(*motifs)]
return [[float(col.count(nuc)) / float(len(col)) for nuc in 'ACGT'] for col in columns]
def greedy_motif_search(dna_list, k, t):
'''Runs the Greedy Motif Search algorithm and retuns the best motif.'''
# Initialize the best score as a score higher than the highest possible score.
best_score = t*k
# Run the greedy motif search.
for i in xrange(len(dna_list[0])-k+1):
# Initialize the motifs as each k-mer from the first dna sequence.
motifs = [dna_list[0][i:i+k]]
# Find the most probable k-mer in the next string.
for j in xrange(1, t):
current_profile = profile(motifs)
motifs.append(profile_most_probable_kmer(dna_list[j], k, current_profile))
# Check to see if we have a new best scoring list of motifs.
current_score = score(motifs)
if current_score < best_score:
best_score = current_score
best_motifs = motifs
return best_motifs
def main():
'''Main call. Reads, runs, and saves problem specific data.'''
# Read the input data.
full_path = os.path.realpath(__file__)
with open(os.path.join(os.path.dirname(full_path),'data/rosalind_ba2d.txt') ) as input_data:
k, t = map(int, input_data.readline().split())
dna_list = [line.strip() for line in input_data]
# Run the Greedy Motif Search.
best_motifs = greedy_motif_search(dna_list, k, t)
# Print and save the answer.
print '\n'.join(best_motifs)
with open(os.path.join(os.path.dirname(full_path),'output/Textbook_ba2d.txt'), 'w') as output_data:
output_data.write('\n'.join(best_motifs))
if __name__ == '__main__':
main()