-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTextbook_BA1I.py
79 lines (67 loc) · 3.11 KB
/
Textbook_BA1I.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python
'''
A solution to a code challenges that accompanies Bioinformatics Algorithms: An Active-Learning Approach by Phillip Compeau & Pavel Pevzner.
The textbook is hosted on Stepic and the problem is listed on ROSALIND under the Textbook Track.
Problem Title: Find All Approximate Occurrences of a Pattern in a String
Chapter #: 01
Problem ID: I
URL: http://rosalind.info/problems/ba1i/
'''
from collections import OrderedDict
from operator import itemgetter
from itertools import product
from collections import defaultdict
from itertools import combinations, product, izip
import os
def pattern_mismatches(kmer, d):
# initialize mismatches with the k-mer itself (i.e. d=0).
mismatches = [kmer]
alt_bases = {'A':'CGT', 'C':'AGT', 'G':'ACT', 'T':'ACG'}
for dist in xrange(1, d+1):
for change_indices in combinations(xrange(len(kmer)), dist):
for substitutions in product(*[alt_bases[kmer[i]] for i in change_indices]):
new_mistmatch = list(kmer)
for idx, sub in izip(change_indices, substitutions):
new_mistmatch[idx] = sub
mismatches.append(''.join(new_mistmatch))
return mismatches
def most_freq_words_with_mismatchs(dna, k, d):
result_mismatch = []
if not k <= 12 and k >= 1:
raise ValueError("Pattern length must be between 0 and 12. {} was passed.".format(k))
if not d <= 3 and d >= 1:
raise ValueError("Max. mismatch must be between 0 and 3. {} was passed.".format(d))
motif_words = defaultdict(int)
for i in range(len(dna)-k+1): # split in every space.
motif_words[dna[i:i+k]] += 1
# Get all of the mismatches for each unique motif word in the sequence
mismatch_count = defaultdict(int)
for kmer, freq in motif_words.iteritems():
for mismatch in pattern_mismatches(kmer, d):
mismatch_count[mismatch] += freq
max_count_num = max(mismatch_count.values())
for i,word in enumerate(sorted(mismatch_count,key = mismatch_count.get,reverse = True)[:len(dna)]):
# sorts the dict by the values, from top to botton, takes the all items sorted from high to low
print(word, mismatch_count[word])
if (mismatch_count[word] == max_count_num):
result_mismatch.append(word)
return result_mismatch
def main():
"""
Main call. Reads, runs, and saves problem specific data.
"""
# Read the input data.
full_path = os.path.realpath(__file__)
with open(os.path.join(os.path.dirname(full_path), 'data/rosalind_ba1i.txt')) as input_data:
dna = input_data.readline().strip()
params = input_data.readline().strip().split('\n')
split_params = [n for n in params[0].split(' ')]
k_mere = int(split_params[0])
d_mis = int(split_params[1])
mismatches = most_freq_words_with_mismatchs(dna, k_mere, d_mis)
print(mismatches)
# Print and save the answer.
with open(os.path.join(os.path.dirname(full_path), 'output/Textbook_ba1i.txt'), 'w') as output_data:
output_data.write(" ".join(str(i) for i in mismatches))
if __name__ == '__main__':
main()