Peer_review/NPAS1.py at 638b0e1ee2b80d1ede74c3b783670b5055559bee · Python-BI-2023/Peer_review · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
from abc import ABC, abstractmethod
from random import choice

from Bio import SeqIO
from Bio.SeqUtils import gc_fraction


def filter_fastq(input_path: str, output_filename: str = None, gc_bounds=(0, 100), length_bounds=(0, 2 ** 32), quality_threshold=0):
    """
    Filter a FASTQ file based on specified criteria and save the filtered sequences to a new FASTQ file.

    Args:
        input_path (str): The path to the input FASTQ file.
        output_filename (str, optional): The name for the output FASTQ file. If not provided, the input file name is used.
        gc_bounds (tuple, optional): GC content filter bounds (default is (0, 100)).
        length_bounds (tuple, optional): Sequence length filter bounds (default is (0, 2**32)).
        quality_threshold (int, optional): Quality score threshold (default is 0).

    Returns:
        None: This function does not return a value but saves the filtered sequences to a new FASTQ file.
    """
    filtered_seqs = []

    for record in SeqIO.parse(input_path, "fastq"):
        gc_percent = gc_fraction(str(record.seq)) * 100
        seq_len = len(record)
        mean_offset = sum(record.letter_annotations["phred_quality"]) / len(record)

        if gc_bounds[0] <= gc_percent <= gc_bounds[1] and \
                length_bounds[0] <= seq_len <= length_bounds[1] and \
                mean_offset >= quality_threshold:
            filtered_seqs.append(record)

    if output_filename is None:
        output_filename = input_path.split("/")[-1]
    else:
        output_filename = output_filename + ".fastq"

    output_path = output_filename
    SeqIO.write(filtered_seqs, output_path, "fastq")

class BiologicalSequence(ABC):
    """
        Abstract base class representing a biological sequence.

        Defines common operations for biological sequences, such as DNA, RNA, or protein sequences.

        Attributes:
            sequence (str): The sequence of characters representing the biological sequence.

        Methods:
            __len__(): Returns the length of the biological sequence.
            __getitem__(index): Gets the character at the specified index or returns a subsequence.
            __str__(): Returns a string representation of the biological sequence.
            is_valid_alphabet(): Checks if all characters in the sequence belong to a valid alphabet.
        """
    def __init__(self, sequence):
        self.sequence = sequence
        if not self.alphabet_checking():
            raise ValueError("Invalid characters in the sequence.")

    @abstractmethod
    def __len__(self):
        pass

    @abstractmethod
    def __getitem__(self, index):
        pass

    def __str__(self):
        return self.sequence

    def __repr__(self):
        return f"{self.__class__.__name__}('{self.sequence}')"

    def alphabet_checking(self):
        if not set(self.sequence) <= set(type(self).ALPHABET):
            return False
        return True


class NucleicAcidSequence(BiologicalSequence):
    """
        Represents a nucleic acid sequence, such as DNA or RNA.

        Inherits from BiologicalSequence and adds functionality specific to nucleic acid sequences.

        Attributes:
            sequence (str): The sequence of characters representing the nucleic acid sequence.

        Methods:
            complement(): Returns the complement of the nucleic acid sequence.
            gc_content(as_percentage=False): Calculates the GC content of the nucleic acid sequence.

        Overrides:
            is_valid_alphabet(): Checks if all characters in the sequence belong to a valid nucleic acid alphabet.
        """

    COMPLEMENT_DICT = {
        'A': 'T',
        'T': 'A',
        'C': 'G',
        'G': 'C',
        'a': 't',
        't': 'a',
        'c': 'g',
        'g': 'c'}

    def __init__(self, sequence):
        super().__init__(sequence)

    def __len__(self):
        return len(self.sequence)

    def __getitem__(self, index):
        return self.sequence[index]

    def complement(self):
        return ''.join(self.COMPLEMENT_DICT.get(base, base) for base in self.sequence)

    def gc_content(self, as_percentage=False):
        gc_count = self.sequence.count('G') + self.sequence.count('C')
        total_count = len(self.sequence)
        gc_content = gc_count / total_count if total_count > 0 else 0
        return gc_content * 100 if as_percentage else gc_content


class DNASequence(NucleicAcidSequence):
    ALPHABET = set("ATGCatgc")

    TRANSCRIBE_DICT = {
        'T': 'U',
        't': 'u'
    }

    def __init__(self, sequence):
        super().__init__(sequence)

    def transcribe(self):
        transcribed_seq = ''.join(self.TRANSCRIBE_DICT.get(base, base) for base in self.sequence)
        return transcribed_seq


class RNASequence(NucleicAcidSequence):
    ALPHABET = set("AUGCaugc")


class AminoAcidSequence(BiologicalSequence):
    """
        This function takes  aminoacid sequence and translates in to the RNA.
        As most of the aminoacids are coded with several different codons,
        this function will take a random codon of the set for such aminoacids.

        Arguments:
            seq (str): A sequence of RNA molecule

        Output:
            returns sequence of aminoacids
        """

    ALPHABET = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy")

    def __init__(self, sequence):
        super().__init__(sequence)

    def __len__(self):
        return len(self.sequence)

    def __getitem__(self, index):
        return self.sequence[index]

    def translate_to_rna(self):
        AA_CODON_DICT = {
            "G": ["GGA", "GGU", "GGC", "GGG"],
            "R": ["AGA", "AGG", "CGA", "CGC", "CGG", "CGU"],
            "S": ["AGC", "AGU", "UCA", "UCC", "UCG", "UCU"],
            "E": ["GAA", "GAG"],
            "P": ["CCA", "CCC", "CCG", "CCU"],
            "L": ["CUA", "CUC", "CUG", "CUU", "UUA", "UUG"],
            "V": ["GUA", "GUC", "GUG", "GUU"],
            "T": ["ACA", "ACC", "ACG", "ACU"],
            "A": ["GCA", "GCC", "GCG", "GCU"],
            "I": ["AUA", "AUC", "AUU"],
            "F": ["UUC", "UUU"],
            "H": ["CAC", "CAU"],
            "Y": ["UAC", "UAU"],
            "Q": ["CAA", "CAG"],
            "C": ["UGC", "UGU"],
            "N": ["AAC", "AAU"],
            "D": ["GAC", "GAU"],
            "K": ["AAA", "AAG"],
            "M": ["AUG"],
            "W": ["UGG"],
        }
        rna_seq = ""
        for aa in self.sequence:
            codon = choice(AA_CODON_DICT[aa])
            rna_seq += codon
        return rna_seq