NucleicTools/bio_files_processor.py at main · alexk-git/NucleicTools · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
'''
    Functions for reading bioinformatics files.

    convert_multiline_fasta_to_oneline: reads a fasta file supplied as input,
    in which the sequence (DNA/RNA/protein/etc.) can be split into several lines,
    and then saves it into a new fasta file in which each sequence fits one line.

    select_genes_from_gbk_to_fasta: function receives a GBK-file as input,
    extracts the specified number of genes before and after each gene of interest (gene),
    and saves their protein sequence (translation) to a fasta file.

'''

from pathlib import Path
from typing import Union
import json
import modules.fastq_tools

def convert_multiline_fasta_to_oneline(input_fasta: str) -> None:
    '''
    A function processes fasta file: reads a fasta file supplied as input_fasta (str),
    where one sequence (DNA/RNA/protein/etc.) can be split into several lines,
    and then saves it into a new fasta file (output_fasta) where each sequence fits one line.

    Input:
        input_fasta: file with the input sequences

    Arguments:
        input_fasta: str

    Returns:
        output_fasta: a file where each sequence fits one line

    Raises:
        exceptions if something went wrong.
    '''

    path_to_write = Path(f"output_{input_fasta}")

    with open(input_fasta, "r") as file:
        rez = {}
        line = file.readline()
        while line:
            if line and line[0] == '>':
                key_line = line.strip()
                rez[key_line] = []
                n_line = file.readline().strip()
                while n_line and n_line[0] != '>':
                    if n_line:
                        rez[key_line].append(n_line)
                    n_line = file.readline().strip()
                    line = n_line
            else:
                line = file.readline().strip()

        with open(path_to_write, "w") as file_w:
            for k in rez.keys():
                file_w.write(k+'\n')
                file_w.write(''.join(rez[k])+'\n')

def parse_blast_output(input_gbk: str, genes: Union[int, tuple, list], output_fasta: str, n_before: int = 1, n_after: int = 1):
    '''
    Receives a GBK-file as input, extracts the specified number of genes before and after each gene of interest (gene),
    and saves their protein sequence (translation) to a fasta file.

    Arguments:
        input_gbk: path to the input GBK file
        genes: genes of interest, near which neighbors are searched (string or collection of strings).
        n_before, n_after: number of genes before and after (>=1)
        output_fasta: output file name.

    Returns:
        output_fasta: a file where each sequence fits one line

    Raises:
        exceptions if something went wrong.
    '''

    genes_parsed = {}
    gene_count = 0
    current_gene = None
    current_translation = ""
    in_translation = False
    in_cds = False
    path_to_write = Path(f"output_{input_gbk.split('.')[0]}")

    with open(input_gbk, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.rstrip()

            if line.startswith('     CDS'):
                if in_cds and current_gene is not None and current_translation:
                    gene_count += 1
                    genes_parsed[current_gene] = {
                        'gene_count': gene_count,
                        'gene': current_gene,
                        'translation': current_translation
                    }

                in_cds = True
                current_gene = None
                current_translation = ""
                in_translation = False
                continue

            if in_cds:
                if line and not line.startswith('                     '):
                    if current_gene is not None and current_translation:
                        gene_count += 1
                        genes_parsed[current_gene] = {
                            'gene_count': gene_count,
                            'gene': current_gene,
                            'translation': current_translation
                        }

                    in_cds = line.startswith('     CDS')
                    current_gene = None
                    current_translation = ""
                    in_translation = False
                    if in_cds:
                        continue

                if '/gene=' in line and current_gene is None:
                    gene_part = line.split('/gene=')[1].strip()
                    if gene_part.startswith('"'):
                        current_gene = gene_part.split('"')[1]
                    else:
                        current_gene = gene_part.split()[0].strip('"')

                elif '/translation=' in line:
                    in_translation = True
                    translation_part = line.split('/translation=')[1].strip()
                    if translation_part.startswith('"'):
                        translation_part = translation_part[1:]
                    if translation_part.endswith('"'):
                        translation_part = translation_part[:-1]
                        in_translation = False

                    current_translation = translation_part

                elif in_translation:
                    clean_line = line.strip()

                    if '"' in clean_line:
                        clean_line = clean_line.split('"')[0]
                        in_translation = False

                    if clean_line.startswith('/'):
                        in_translation = False
                    else:
                        current_translation += clean_line

    if in_cds and current_gene is not None and current_translation:
        gene_count += 1
        genes_parsed[current_gene] = {
            'gene_count': gene_count,
            'gene': current_gene,
            'translation': current_translation
        }

    #print(genes_parsed)

    # For future use:
    # to avoid new-parsing the if the data is already collected in a convenient form (dictionary)
    with open(f'{input_gbk.split(".")[0]}.json', 'w', encoding='utf-8') as f:
        json.dump(genes_parsed, f, ensure_ascii=False, indent=4)

    print(f'saved human readable version of {input_gbk} in {input_gbk.split(".")[0]}.json')

    genes_of_interests = modules.fastq_tools.find_genes_with_neighbors(genes_parsed, genes, n_before, n_after)

    modules.fastq_tools.write_genes_seq_to_fasta(genes_of_interests, path_to_write)

    return None