From 8f9f63f357aa88facebf0b25554497c89360c502 Mon Sep 17 00:00:00 2001 From: Vaishnav88sk Date: Sat, 2 May 2026 14:41:00 +0530 Subject: [PATCH] [ENH] Add augment_complement for RNA/DNA reverse complement augmentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add augment_complement() to _augment.py that generates biologically meaningful reverse complement sequences (A↔U for RNA, A↔T for DNA, C↔G). Supports both RNA and DNA via molecule_type parameter, preserves case, and follows the same API pattern as augment_reverse. Includes 8 new tests covering RNA/DNA complements, case handling, round-trip validation, multiple arrays, and error handling. --- pyaptamer/utils/_augment.py | 84 ++++++++++++++++++++++++++- pyaptamer/utils/tests/test_augment.py | 78 ++++++++++++++++++++++++- 2 files changed, 160 insertions(+), 2 deletions(-) diff --git a/pyaptamer/utils/_augment.py b/pyaptamer/utils/_augment.py index 3a23ac3a..860e2cf5 100644 --- a/pyaptamer/utils/_augment.py +++ b/pyaptamer/utils/_augment.py @@ -1,8 +1,12 @@ __author__ = ["nennomp"] -__all__ = ["augment_reverse"] +__all__ = ["augment_reverse", "augment_complement"] import numpy as np +# Complement mapping tables for DNA and RNA +_DNA_COMPLEMENT = str.maketrans("ACGTacgt", "TGCAtgca") +_RNA_COMPLEMENT = str.maketrans("ACGUacgu", "UGCAugca") + def augment_reverse(*sequence_arrays: np.ndarray) -> tuple[np.ndarray, ...]: """Augment arrays of sequences by adding the reversed sequences. @@ -27,3 +31,81 @@ def augment_reverse(*sequence_arrays: np.ndarray) -> tuple[np.ndarray, ...]: results.append(result) return tuple(results) + + +def _reverse_complement(sequence: str, molecule_type: str) -> str: + """Compute the reverse complement of a single sequence. + + Parameters + ---------- + sequence : str + A nucleotide sequence string. + molecule_type : str + Either ``"rna"`` or ``"dna"``. + + Returns + ------- + str + The reverse complement of the input sequence. 
+ """ + table = _RNA_COMPLEMENT if molecule_type == "rna" else _DNA_COMPLEMENT + return sequence.translate(table)[::-1] + + +def augment_complement( + *sequence_arrays: np.ndarray, + molecule_type: str = "rna", +) -> tuple[np.ndarray, ...]: + """Augment arrays of sequences by adding their reverse complements. + + For each input array, the reverse complement of every sequence is + computed and concatenated with the originals. This is a biologically + meaningful augmentation strategy because the reverse complement + represents the complementary strand of a nucleic acid duplex + (A↔U for RNA, A↔T for DNA, C↔G for both). + + Parameters + ---------- + *sequence_arrays : np.ndarray + Variable number of numpy arrays of sequences (containing strings). + molecule_type : str, optional, default="rna" + Type of molecule. Must be ``"rna"`` or ``"dna"``. + Determines the complement mapping: + + - ``"rna"``: A↔U, C↔G + - ``"dna"``: A↔T, C↔G + + Returns + ------- + tuple[np.ndarray, ...] + A tuple of arrays, each containing the original sequences followed + by their reverse complements. + + Raises + ------ + ValueError + If ``molecule_type`` is not ``"rna"`` or ``"dna"``. 
+ + Examples + -------- + >>> import numpy as np + >>> from pyaptamer.utils._augment import augment_complement + >>> seqs = np.array(["AUGC", "GCAU"]) + >>> (result,) = augment_complement(seqs, molecule_type="rna") + >>> result + array(['AUGC', 'GCAU', 'GCAU', 'AUGC'], dtype='<U4') [... extraction gap: the remainder of augment_complement and the beginning of the pyaptamer/utils/tests/test_augment.py diff are missing here ...] + """Test basic RNA reverse complement: A<->U, C<->G.""" + seqs = np.array(["AUGC"]) + (result,) = augment_complement(seqs, molecule_type="rna") + + assert len(result) == 2 + # reverse complement of AUGC -> complement UACG -> reverse GCAU + assert result[0] == "AUGC" + assert result[1] == "GCAU" + + def test_dna_complement_basic(self): + """Test basic DNA reverse complement: A<->T, C<->G.""" + seqs = np.array(["ATGC"]) + (result,) = augment_complement(seqs, molecule_type="dna") + + assert len(result) == 2 + # reverse complement of ATGC -> complement TACG -> reverse GCAT + assert result[0] == "ATGC" + assert result[1] == "GCAT" + + def test_rna_complement_preserves_case(self): + """Test that case is preserved in complement.""" + seqs = np.array(["AuGc"]) + (result,) = augment_complement(seqs, molecule_type="rna") + + assert result[1] == "gCaU" + + def test_complement_doubles_array_size(self): + """Test that output is exactly double the input.""" + seqs = np.array(["AAAA", "CCCC", "GGGG"]) + (result,) = augment_complement(seqs, molecule_type="rna") + assert len(result) == 6 + + def test_complement_multiple_arrays(self): + """Test augmentation with multiple arrays.""" + seq1 = np.array(["AUGC"]) + seq2 = np.array(["GCAU", "AAAA"]) + r1, r2 = augment_complement(seq1, seq2, molecule_type="rna") + + assert len(r1) == 2 + assert len(r2) == 4 + + def test_complement_invalid_molecule_type(self): + """Test that invalid molecule_type raises ValueError.""" + seqs = np.array(["AUGC"]) + with pytest.raises(ValueError, match="molecule_type must be"): + augment_complement(seqs, molecule_type="protein") + + def test_complement_roundtrip(self): + """Test that double complement returns original sequences.""" + seqs = np.array(["AUGCUAGC"]) + (first,) = 
augment_complement(seqs, molecule_type="rna") + # first[1] is the reverse complement; applying again should return original + rc = first[1] + (second,) = augment_complement(np.array([rc]), molecule_type="rna") + assert second[1] == seqs[0] + + def test_dna_complement_poly_sequences(self): + """Test complement of poly-nucleotide sequences.""" + seqs = np.array(["AAAA", "TTTT", "CCCC", "GGGG"]) + (result,) = augment_complement(seqs, molecule_type="dna") + + assert result[4] == "TTTT" # rc(AAAA) = TTTT + assert result[5] == "AAAA" # rc(TTTT) = AAAA + assert result[6] == "GGGG" # rc(CCCC) = GGGG + assert result[7] == "CCCC" # rc(GGGG) = CCCC +