Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 200 additions & 0 deletions NPAS1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
from abc import ABC, abstractmethod
from random import choice

from Bio import SeqIO
from Bio.SeqUtils import gc_fraction


def filter_fastq(input_path: str, output_filename: str = None, gc_bounds=(0, 100), length_bounds=(0, 2 ** 32), quality_threshold=0):
"""
Filter a FASTQ file based on specified criteria and save the filtered sequences to a new FASTQ file.

Args:
input_path (str): The path to the input FASTQ file.
output_filename (str, optional): The name for the output FASTQ file. If not provided, the input file name is used.
gc_bounds (tuple, optional): GC content filter bounds (default is (0, 100)).
length_bounds (tuple, optional): Sequence length filter bounds (default is (0, 2**32)).
quality_threshold (int, optional): Quality score threshold (default is 0).

Returns:
None: This function does not return a value but saves the filtered sequences to a new FASTQ file.
"""
filtered_seqs = []

for record in SeqIO.parse(input_path, "fastq"):
gc_percent = gc_fraction(str(record.seq)) * 100
seq_len = len(record)
mean_offset = sum(record.letter_annotations["phred_quality"]) / len(record)

if gc_bounds[0] <= gc_percent <= gc_bounds[1] and \
length_bounds[0] <= seq_len <= length_bounds[1] and \
Comment on lines +29 to +30

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Можно было бы еще учесть. что gc_bounds и length_bounds может подаваться одно число, являющееся верхней границей.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Даже не можно а нужно :)

mean_offset >= quality_threshold:
Comment on lines +29 to +31
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if gc_bounds[0] <= gc_percent <= gc_bounds[1] and \
length_bounds[0] <= seq_len <= length_bounds[1] and \
mean_offset >= quality_threshold:
if (
gc_bounds[0] <= gc_percent <= gc_bounds[1]
and length_bounds[0] <= seq_len <= length_bounds[1]
and mean_offset >= quality_threshold
):

Есть такой ЛаЙфХаК, хотя твой способ не осуждаю (но этот имо аккурпатнее)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Согласен, со скобочками по-мне выглядит покрасивше, хотя наверное в этом случае лучше оставлять and в конце строки
но хз

filtered_seqs.append(record)

if output_filename is None:
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Хорошо, что проверка есть

output_filename = input_path.split("/")[-1]
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Я бы предложил для таких проверок использовать всё-таки библиотечки типа os или pathlib

else:
output_filename = output_filename + ".fastq"
Comment on lines +35 to +37

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Если в input_path уже написано расширение у файла, то он к этому расширению добавит ".fastq". Также некоторые операционные системы могут не понять "/".
Кажется, лучше сплитовать по ".", брать элемент с индексом 0 и добавлять к нему ".fasta". Тогда output файл будет сохранен в той же папке с одним расширением.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Верное замечание. А вообще чтобы было универсально можно использовать basename

Copy link

@wwoskie wwoskie Mar 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

У меня очень смешанные ощущения от этого места, с одной стороны наказывать забывчивых перезаписью фастку, но если присмотреться, то она запишет в файл с двумя расширениями, в общем я не понял, это баг или фича.....


output_path = output_filename
SeqIO.write(filtered_seqs, output_path, "fastq")

class BiologicalSequence(ABC):
"""
Abstract base class representing a biological sequence.

Defines common operations for biological sequences, such as DNA, RNA, or protein sequences.

Attributes:
sequence (str): The sequence of characters representing the biological sequence.

Methods:
__len__(): Returns the length of the biological sequence.
__getitem__(index): Gets the character at the specified index or returns a subsequence.
__str__(): Returns a string representation of the biological sequence.
is_valid_alphabet(): Checks if all characters in the sequence belong to a valid alphabet.
"""
def __init__(self, sequence):
self.sequence = sequence
if not self.alphabet_checking():
raise ValueError("Invalid characters in the sequence.")

@abstractmethod
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Круто, что ты делаешь самостоятельно эти базовые методы

def __len__(self):
pass

@abstractmethod
def __getitem__(self, index):
Comment on lines +62 to +67

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Отличное использование декораторов!

pass

def __str__(self):
return self.sequence

def __repr__(self):
return f"{self.__class__.__name__}('{self.sequence}')"

def alphabet_checking(self):
if not set(self.sequence) <= set(type(self).ALPHABET):
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Лайк

return False
return True


class NucleicAcidSequence(BiologicalSequence):
"""
Represents a nucleic acid sequence, such as DNA or RNA.

Inherits from BiologicalSequence and adds functionality specific to nucleic acid sequences.

Attributes:
sequence (str): The sequence of characters representing the nucleic acid sequence.

Methods:
complement(): Returns the complement of the nucleic acid sequence.
gc_content(as_percentage=False): Calculates the GC content of the nucleic acid sequence.

Overrides:
is_valid_alphabet(): Checks if all characters in the sequence belong to a valid nucleic acid alphabet.
"""

COMPLEMENT_DICT = {
'A': 'T',
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Хорошо, что нижний регистр учитываем

'T': 'A',
'C': 'G',
'G': 'C',
'a': 't',
't': 'a',
'c': 'g',
'g': 'c'}

def __init__(self, sequence):
super().__init__(sequence)

def __len__(self):
return len(self.sequence)

def __getitem__(self, index):
return self.sequence[index]

def complement(self):
return ''.join(self.COMPLEMENT_DICT.get(base, base) for base in self.sequence)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Круто написано.
Можно было бы еще учесть, что на вход подается РНК.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Интересная конструкция

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Хотя это и немного странно, что он вернет сам себя, как будто немного неинтуитивное поведение

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return ''.join(self.COMPLEMENT_DICT.get(base, base) for base in self.sequence)
return self.__class__(''.join(self.COMPLEMENT_DICT.get(base, base) for base in self.sequence))

можно как-то так попробовать, а то результат выполнения возвращает строку, что не совсем хорошо

image

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Чаще делают не self.__class__(...), а type(self)(...)


def gc_content(self, as_percentage=False):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Здорово, что есть вариант с процентами "as_percentage".

gc_count = self.sequence.count('G') + self.sequence.count('C')
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

count в этом случае пробежится два раза, если будет длинная последовательность может быть неприятно.

total_count = len(self.sequence)
gc_content = gc_count / total_count if total_count > 0 else 0
return gc_content * 100 if as_percentage else gc_content
Comment on lines +124 to +125

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Круто, что не забыли про пустые строки и деление на ноль!



class DNASequence(NucleicAcidSequence):
ALPHABET = set("ATGCatgc")

TRANSCRIBE_DICT = {
'T': 'U',
't': 'u'
}
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Нативная интеграция (извините)


def __init__(self, sequence):
super().__init__(sequence)

def transcribe(self):
transcribed_seq = ''.join(self.TRANSCRIBE_DICT.get(base, base) for base in self.sequence)
return transcribed_seq


class RNASequence(NucleicAcidSequence):
ALPHABET = set("AUGCaugc")


Comment on lines +146 to +147

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Здесь можно было написать функцию, которая осуществляет трансляцию. Вероятно, Вы просто не успели что-либо написать в этом классе.

class AminoAcidSequence(BiologicalSequence):
"""
This function takes aminoacid sequence and translates in to the RNA.
As most of the aminoacids are coded with several different codons,
this function will take a random codon of the set for such aminoacids.

Arguments:
seq (str): A sequence of RNA molecule

Output:
returns sequence of aminoacids
"""

ALPHABET = set("ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwy")

def __init__(self, sequence):
super().__init__(sequence)

def __len__(self):
return len(self.sequence)

def __getitem__(self, index):
return self.sequence[index]

def translate_to_rna(self):
AA_CODON_DICT = {
"G": ["GGA", "GGU", "GGC", "GGG"],
"R": ["AGA", "AGG", "CGA", "CGC", "CGG", "CGU"],
"S": ["AGC", "AGU", "UCA", "UCC", "UCG", "UCU"],
"E": ["GAA", "GAG"],
"P": ["CCA", "CCC", "CCG", "CCU"],
"L": ["CUA", "CUC", "CUG", "CUU", "UUA", "UUG"],
"V": ["GUA", "GUC", "GUG", "GUU"],
"T": ["ACA", "ACC", "ACG", "ACU"],
"A": ["GCA", "GCC", "GCG", "GCU"],
"I": ["AUA", "AUC", "AUU"],
"F": ["UUC", "UUU"],
"H": ["CAC", "CAU"],
"Y": ["UAC", "UAU"],
"Q": ["CAA", "CAG"],
"C": ["UGC", "UGU"],
"N": ["AAC", "AAU"],
"D": ["GAC", "GAU"],
"K": ["AAA", "AAG"],
"M": ["AUG"],
"W": ["UGG"],
}
rna_seq = ""
for aa in self.sequence:
codon = choice(AA_CODON_DICT[aa])
rna_seq += codon
return rna_seq
Comment on lines +195 to +199

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

А, Вы решили восстанавливать мРНК, тоже хорошая идея.