Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 206 additions & 0 deletions ITGA7.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
from __future__ import annotations

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

обратная совместимость? здорово

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Это я возьму себе на заметку. Спасибо :)

import os
import sys
from Bio import SeqIO, SeqUtils, SeqRecord
from Bio.SeqUtils import gc_fraction


class BiologicalSequence:
"""
Represents a generic biological sequence.
Attributes:
- sequence (str): The biological sequence.
Methods:
- __len__(): Returns the length of the sequence.
- __getitem__(index: int): Returns the character at the specified index.
- __str__(): Returns a string representation of the sequence.
- is_valid_alphabet(): Checks if the sequence uses a valid alphabet.
"""

def __init__(self, sequence: str):
self.sequence = sequence

def __len__(self) -> int:
return len(self.sequence)

def __getitem__(self, index: int) -> str:
return self.sequence[index]

def __str__(self) -> str:
return f"{self.sequence}"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

оригинально, но я бы сделал все же

Suggested change
return f"{self.sequence}"
return str(self.sequence)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Но ведь sequence это уже str

Comment on lines +20 to +30

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dunder'ы можно не аннотировать, но и в их аннотации ничего критичного нет.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ну да, это не имеет смысла, так что лучше не надо, потому что кто-то может не очень понять зачем так делать.


def is_valid_alphabet(self) -> bool:
"""
Checks if the sequence uses a valid alphabet.
"""
try:
unique_chars = set(self.sequence)
return unique_chars <= self.ALPHABET
except AttributeError as e:
raise NotImplementedError(
"Is valid alphabet method not implemented for this class."
) from e


class NucleicAcidSequence(BiologicalSequence):
"""
Represents a nucleic acid sequence.
Methods:
- complement(): Returns the complemented sequence.
- gc_content(percentage: bool = True) -> int | float: Calculates the GC content of the sequence.
Raises:
- NotImplementedError: If complement method is not implemented for this class.
"""

def complement(self) -> DNASequence | RNASequence:
try:
complemented_sequence = "".join(
[self.COMPLEMENT_MAP[base] for base in self.sequence]
)
return self.__class__(complemented_sequence)
except AttributeError as e:
raise NotImplementedError(
"Complement method not implemented for this class."

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

честно говоря, я не очень понял суть ошибки, исходя из ее текста. да и в коде выглядит странно: есть класс, в нем есть метод, но при этом эксепт говорит, что метода нет

) from e

def gc_content(self, percentage: bool = True) -> int | float:
"""
Calculates the GC content of the sequence.
Args:
- percentage (bool): If True, returns the result as a percentage.
Returns:
- int | float: GC content value.
"""
gc_symbols = set("GCgc")
gc_count = sum(1 for nucleotide in self.sequence if nucleotide in gc_symbols)
Comment on lines +74 to +75

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

можно и так, но можно и сяк:

Suggested change
gc_symbols = set("GCgc")
gc_count = sum(1 for nucleotide in self.sequence if nucleotide in gc_symbols)
gc_count = (self.sequence.count('G') +
self.sequence.count('C') +
self.sequence.count('g') +
self.sequence.count('c'))

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ну или вообще привести все к одному регистру локально с помощью .upper(), что выглядит тоже здраво

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Так зато все считается за один прогон по строке. Думаю это не так существенно, один вариант чуть эффективнее, другой - чуть читаемее. Кому что больше нравится.

if percentage:
return (gc_count / self.sequence.__len__()) * 100
else:
return gc_count / self.sequence.__len__()


class DNASequence(NucleicAcidSequence):
"""
Represents a DNA sequence.
Methods:
- transcribe(): Returns the transcribed RNA sequence.
"""

def __init__(self, sequence: str):
super().__init__(sequence)
self.ALPHABET = set("ATGCatgc")
self.COMPLEMENT_MAP = {
"A": "T",
"C": "G",
"G": "C",
"T": "A",
"a": "t",
"c": "g",
"g": "c",
"t": "a",
}
Comment on lines +92 to +101

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Мне кажется, что удобнее записывать буквы в верхнем регистре при их перечислении в константах (gc_symbols, self.ALPHABET, self.COMPLEMENT_MAP и т.д.). Поступающую строку, даже если она в разных регистрах, можно приводить к одному уже внутри методов.
Но это дело вкуса, конечно)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Тут это теперь не константа. Константы это переменные которые объявляются в самом начале скрипта. Это просто классовый атрибут, поэтому его не надо делать капсом.


def transcribe(self) -> RNASequence:
"""
Returns the transcribed RNA sequence.
Returns:
- RNASequence: Transcribed RNA sequence.
"""
transcribed_sequence = self.sequence.replace("T", "U").replace("t", "u")
return RNASequence(transcribed_sequence)


class RNASequence(NucleicAcidSequence):
"""
Represents an RNA sequence.
"""

def __init__(self, sequence: str):
super().__init__(sequence)
self.ALPHABET = set("AUGCaugc")
self.COMPLEMENT_MAP = {
"A": "U",
"C": "G",
"G": "C",
"U": "A",
"a": "u",
"c": "g",
"g": "c",
"u": "a",
}


class AminoAcidSequence(BiologicalSequence):
"""
Represents an amino acid sequence.
"""

def __init__(self, sequence: str):
super().__init__(sequence)
self.ALPHABET = set("GgLlYySsEeQqDdNnFfAaKkRrHhCcVvPpWwIiMmTt")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Я вызвала count_aa с символами BjouXZ, и мне выдало ответ вместо ошибки, что такие символы - не аминокислоты. Возможно, стоит доработать вывод ошибок.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Супер что прогоняешь код!


def count_aa(self) -> dict:
"""
Counts the number of given or all amino acids in a protein sequence.
Arguments:
- seq (str): sequence to count amino acids
- aminoacids (str): which amino acids to count in sequence
Return:
- dict: a dictionary with amino acids and its count
"""

aa_dict_count = {}
for aa in set(self.sequence):
aa_dict_count[aa] = self.sequence.count(aa)
return aa_dict_count


def filter_fastq(
path_to_seqs: str,
output_file_name: str = None,
gc_bounds: tuple | int = (0, 100),
length_bounds: tuple | int = (0, 2**32),
quality_threshold: int = 0,
) -> None:
"""
Filters FASTQ sequences based on the GC-content, length and quality parameters.

Args:
- path_to_seqs (str): the path to the FASTQ file to be filtered.
- output_file_name (str): the name of the file where the filtered FASTQ sequences will be saved.
- gc_bounds (tuple, int): GC content range (in percentages) for filtering. If you pass a single number
to the argument, it is assumed to be an upper bound.
- length_bounds (tuple, int): length range for filtering. If you pass a single number to the argument,
it is assumed to be an upper bound.
- quality_threshold (int): threshold value for average read quality filtering

Returns:
- None: the function doesn't return a value but writes the filtered FASTQ to the output file.
"""
if output_file_name is None:
output_file_name = os.path.basename(path_to_seqs)

records = SeqIO.parse(path_to_seqs, "fastq")

filtered_records = []

for record in records:
gc_content = gc_fraction(record.seq)
Copy link

@rereremin rereremin Mar 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

В этой ситуации можно использовать и другой метод (GC) из класса Bio.SeqUtils, осуществляющий подсчет GC-состава:

Suggested change
gc_content = gc_fraction(record.seq)
gc_content = Bio.SeqUtils.GC(record.seq)

if isinstance(gc_bounds, tuple):
if not (gc_bounds[0] <= gc_content <= gc_bounds[1]):
continue
elif gc_content >= gc_bounds:
continue
sequence_length = len(record.seq)
if isinstance(length_bounds, tuple):
if not (length_bounds[0] <= sequence_length <= length_bounds[1]):
continue
elif sequence_length >= length_bounds:
continue
mean_quality = sum(record.letter_annotations["phred_quality"]) / sequence_length
if mean_quality <= quality_threshold:
continue
filtered_records.append(record)

with open(output_file_name, "w") as output_handle:
SeqIO.write(filtered_records, output_handle, "fastq")