-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbio_files_processor.py
More file actions
185 lines (139 loc) · 6.5 KB
/
bio_files_processor.py
File metadata and controls
185 lines (139 loc) · 6.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
'''
Functions for reading bioinformatics files.
convert_multiline_fasta_to_oneline: reads a fasta file supplied as input,
in which the sequence (DNA/RNA/protein/etc.) can be split into several lines,
and then saves it into a new fasta file in which each sequence fits one line.
select_genes_from_gbk_to_fasta: function receives a GBK-file as input,
extracts the specified number of genes before and after each gene of interest (gene),
and saves their protein sequence (translation) to a fasta file.
'''
from pathlib import Path
from typing import Union
import json
import modules.fastq_tools
def convert_multiline_fasta_to_oneline(input_fasta: str) -> None:
'''
A function processes fasta file: reads a fasta file supplied as input_fasta (str),
where one sequence (DNA/RNA/protein/etc.) can be split into several lines,
and then saves it into a new fasta file (output_fasta) where each sequence fits one line.
Input:
input_fasta: file with the input sequences
Arguments:
input_fasta: str
Returns:
output_fasta: a file where each sequence fits one line
Raises:
exceptions if something went wrong.
'''
path_to_write = Path(f"output_{input_fasta}")
with open(input_fasta, "r") as file:
rez = {}
line = file.readline()
while line:
if line and line[0] == '>':
key_line = line.strip()
rez[key_line] = []
n_line = file.readline().strip()
while n_line and n_line[0] != '>':
if n_line:
rez[key_line].append(n_line)
n_line = file.readline().strip()
line = n_line
else:
line = file.readline().strip()
with open(path_to_write, "w") as file_w:
for k in rez.keys():
file_w.write(k+'\n')
file_w.write(''.join(rez[k])+'\n')
def parse_blast_output(input_gbk: str, genes: Union[int, tuple, list], output_fasta: str, n_before: int = 1, n_after: int = 1):
'''
Receives a GBK-file as input, extracts the specified number of genes before and after each gene of interest (gene),
and saves their protein sequence (translation) to a fasta file.
Arguments:
input_gbk: path to the input GBK file
genes: genes of interest, near which neighbors are searched (string or collection of strings).
n_before, n_after: number of genes before and after (>=1)
output_fasta: output file name.
Returns:
output_fasta: a file where each sequence fits one line
Raises:
exceptions if something went wrong.
'''
genes_parsed = {}
gene_count = 0
current_gene = None
current_translation = ""
in_translation = False
in_cds = False
path_to_write = Path(f"output_{input_gbk.split('.')[0]}")
with open(input_gbk, 'r', encoding='utf-8') as file:
for line in file:
line = line.rstrip()
if line.startswith(' CDS'):
if in_cds and current_gene is not None and current_translation:
gene_count += 1
genes_parsed[current_gene] = {
'gene_count': gene_count,
'gene': current_gene,
'translation': current_translation
}
in_cds = True
current_gene = None
current_translation = ""
in_translation = False
continue
if in_cds:
if line and not line.startswith(' '):
if current_gene is not None and current_translation:
gene_count += 1
genes_parsed[current_gene] = {
'gene_count': gene_count,
'gene': current_gene,
'translation': current_translation
}
in_cds = line.startswith(' CDS')
current_gene = None
current_translation = ""
in_translation = False
if in_cds:
continue
if '/gene=' in line and current_gene is None:
gene_part = line.split('/gene=')[1].strip()
if gene_part.startswith('"'):
current_gene = gene_part.split('"')[1]
else:
current_gene = gene_part.split()[0].strip('"')
elif '/translation=' in line:
in_translation = True
translation_part = line.split('/translation=')[1].strip()
if translation_part.startswith('"'):
translation_part = translation_part[1:]
if translation_part.endswith('"'):
translation_part = translation_part[:-1]
in_translation = False
current_translation = translation_part
elif in_translation:
clean_line = line.strip()
if '"' in clean_line:
clean_line = clean_line.split('"')[0]
in_translation = False
if clean_line.startswith('/'):
in_translation = False
else:
current_translation += clean_line
if in_cds and current_gene is not None and current_translation:
gene_count += 1
genes_parsed[current_gene] = {
'gene_count': gene_count,
'gene': current_gene,
'translation': current_translation
}
#print(genes_parsed)
# For future use:
# to avoid new-parsing the if the data is already collected in a convenient form (dictionary)
with open(f'{input_gbk.split(".")[0]}.json', 'w', encoding='utf-8') as f:
json.dump(genes_parsed, f, ensure_ascii=False, indent=4)
print(f'saved human readable version of {input_gbk} in {input_gbk.split(".")[0]}.json')
genes_of_interests = modules.fastq_tools.find_genes_with_neighbors(genes_parsed, genes, n_before, n_after)
modules.fastq_tools.write_genes_seq_to_fasta(genes_of_interests, path_to_write)
return None