Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 83 additions & 1 deletion pyaptamer/utils/_augment.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
__author__ = ["nennomp"]
__all__ = ["augment_reverse"]
__all__ = ["augment_reverse", "augment_complement"]

import numpy as np

# Translation tables for ``str.translate`` that swap every nucleotide with its
# complementary base. Upper- and lower-case letters are listed separately so
# the case of each character is kept; characters not listed are left as-is.
_DNA_COMPLEMENT = str.maketrans(
    "ACGT" + "acgt",
    "TGCA" + "tgca",
)
_RNA_COMPLEMENT = str.maketrans(
    "ACGU" + "acgu",
    "UGCA" + "ugca",
)


def augment_reverse(*sequence_arrays: np.ndarray) -> tuple[np.ndarray, ...]:
"""Augment arrays of sequences by adding the reversed sequences.
Expand All @@ -27,3 +31,81 @@ def augment_reverse(*sequence_arrays: np.ndarray) -> tuple[np.ndarray, ...]:
results.append(result)

return tuple(results)


def _reverse_complement(sequence: str, molecule_type: str) -> str:
    """Return the reverse complement of one nucleotide sequence.

    Every base is first replaced by its complementary base, then the
    resulting string is read back to front.

    Parameters
    ----------
    sequence : str
        The nucleotide sequence to transform.
    molecule_type : str
        ``"rna"`` selects the RNA complement table; any other value selects
        the DNA table (no validation is performed here).

    Returns
    -------
    str
        The complemented sequence in reversed order.
    """
    if molecule_type == "rna":
        complement_table = _RNA_COMPLEMENT
    else:
        complement_table = _DNA_COMPLEMENT

    complemented = sequence.translate(complement_table)
    return complemented[::-1]


def augment_complement(
    *sequence_arrays: np.ndarray,
    molecule_type: str = "rna",
) -> tuple[np.ndarray, ...]:
    """Augment arrays of sequences by adding their reverse complements.

    For each input array, the reverse complement of every sequence is
    computed and concatenated with the originals. This is a biologically
    meaningful augmentation strategy because the reverse complement
    represents the complementary strand of a nucleic acid duplex
    (A↔U for RNA, A↔T for DNA, C↔G for both).

    Parameters
    ----------
    *sequence_arrays : np.ndarray
        Variable number of numpy arrays of sequences (containing strings).
    molecule_type : str, optional, default="rna"
        Type of molecule. Must be ``"rna"`` or ``"dna"``.
        Determines the complement mapping:

        - ``"rna"``: A↔U, C↔G
        - ``"dna"``: A↔T, C↔G

    Returns
    -------
    tuple[np.ndarray, ...]
        A tuple of arrays, each containing the original sequences followed
        by their reverse complements. Each output array has the dtype of the
        corresponding input, including when that input is empty.

    Raises
    ------
    ValueError
        If ``molecule_type`` is not ``"rna"`` or ``"dna"``.

    Examples
    --------
    >>> import numpy as np
    >>> from pyaptamer.utils._augment import augment_complement
    >>> seqs = np.array(["AUGC", "GCAU"])
    >>> (result,) = augment_complement(seqs, molecule_type="rna")
    >>> result
    array(['AUGC', 'GCAU', 'GCAU', 'AUGC'], dtype='<U4')
    """
    if molecule_type not in ("rna", "dna"):
        raise ValueError(
            f"molecule_type must be 'rna' or 'dna', got '{molecule_type}'."
        )

    results = []
    for sequences in sequence_arrays:
        originals = np.asarray(sequences)
        # Reuse the input dtype: a reverse complement has the same length as
        # its source, and without an explicit dtype an empty input would
        # produce a float64 array and change the dtype of the concatenation.
        rc_sequences = np.array(
            [_reverse_complement(seq, molecule_type) for seq in originals],
            dtype=originals.dtype,
        )
        results.append(np.concatenate([originals, rc_sequences]))

    return tuple(results)
78 changes: 77 additions & 1 deletion pyaptamer/utils/tests/test_augment.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
__author__ = ["nennomp"]

import numpy as np
import pytest

from pyaptamer.utils._augment import augment_reverse
from pyaptamer.utils._augment import augment_complement, augment_reverse


def test_augment_reverse_single_array():
Expand Down Expand Up @@ -37,3 +38,78 @@ def test_augment_reverse_multiple_arrays():
np.testing.assert_array_equal(result[0], expected[0])
np.testing.assert_array_equal(result[1], expected[1])
np.testing.assert_array_equal(result[2], expected[2])


# ---------- Tests for augment_complement ----------


class TestAugmentComplement:
    """Tests for the augment_complement function."""

    def test_rna_complement_basic(self):
        """Test basic RNA reverse complement: A<->U, C<->G."""
        seqs = np.array(["AUGC"])
        (result,) = augment_complement(seqs, molecule_type="rna")

        # complement of AUGC is UACG; reversed, that is GCAU
        np.testing.assert_array_equal(result, np.array(["AUGC", "GCAU"]))

    def test_dna_complement_basic(self):
        """Test basic DNA reverse complement: A<->T, C<->G."""
        seqs = np.array(["ATGC"])
        (result,) = augment_complement(seqs, molecule_type="dna")

        # complement of ATGC is TACG; reversed, that is GCAT
        np.testing.assert_array_equal(result, np.array(["ATGC", "GCAT"]))

    def test_rna_complement_preserves_case(self):
        """Test that the case of each character is preserved."""
        seqs = np.array(["AuGc"])
        (result,) = augment_complement(seqs, molecule_type="rna")

        # complement of AuGc is UaCg; reversed, that is gCaU
        assert result[1] == "gCaU"

    def test_complement_doubles_array_size(self):
        """Test that output is exactly double the input, originals first."""
        seqs = np.array(["AAAA", "CCCC", "GGGG"])
        (result,) = augment_complement(seqs, molecule_type="rna")

        assert len(result) == 2 * len(seqs)
        np.testing.assert_array_equal(result[: len(seqs)], seqs)

    def test_complement_multiple_arrays(self):
        """Test augmentation with multiple arrays."""
        seq1 = np.array(["AUGC"])
        seq2 = np.array(["GCAU", "AAAA"])
        r1, r2 = augment_complement(seq1, seq2, molecule_type="rna")

        # Check contents, not only lengths, so that mixing up the arrays or
        # returning wrong sequences is detected.
        np.testing.assert_array_equal(r1, np.array(["AUGC", "GCAU"]))
        np.testing.assert_array_equal(
            r2, np.array(["GCAU", "AAAA", "AUGC", "UUUU"])
        )

    def test_complement_invalid_molecule_type(self):
        """Test that invalid molecule_type raises ValueError."""
        seqs = np.array(["AUGC"])
        with pytest.raises(ValueError, match="molecule_type must be"):
            augment_complement(seqs, molecule_type="protein")

    def test_complement_roundtrip(self):
        """Test that applying the reverse complement twice is the identity."""
        seqs = np.array(["AUGCUAGC"])
        (first,) = augment_complement(seqs, molecule_type="rna")
        # first[1] is the reverse complement; applying again should return original
        rc = first[1]
        (second,) = augment_complement(np.array([rc]), molecule_type="rna")
        assert second[1] == seqs[0]

    def test_dna_complement_poly_sequences(self):
        """Test reverse complement of poly-nucleotide sequences."""
        seqs = np.array(["AAAA", "TTTT", "CCCC", "GGGG"])
        (result,) = augment_complement(seqs, molecule_type="dna")

        # rc(AAAA) = TTTT, rc(TTTT) = AAAA, rc(CCCC) = GGGG, rc(GGGG) = CCCC
        np.testing.assert_array_equal(
            result[len(seqs) :], np.array(["TTTT", "AAAA", "GGGG", "CCCC"])
        )