From 8f9f63f357aa88facebf0b25554497c89360c502 Mon Sep 17 00:00:00 2001
From: Vaishnav88sk <vaishnavsk8804@gmail.com>
Date: Sat, 2 May 2026 14:41:00 +0530
Subject: [PATCH] [ENH] Add augment_complement for RNA/DNA reverse complement
 augmentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add augment_complement() to _augment.py. It appends to each input array
the reverse complement of every sequence (A↔U for RNA, A↔T for DNA, C↔G
for both). The alphabet is selected with the molecule_type parameter,
case is preserved, and the API mirrors augment_reverse.

Includes 8 new tests covering RNA/DNA complements, case handling,
round-trip validation, multiple arrays, and error handling.
---
 pyaptamer/utils/_augment.py           | 84 ++++++++++++++++++++++++++-
 pyaptamer/utils/tests/test_augment.py | 78 ++++++++++++++++++++++++-
 2 files changed, 160 insertions(+), 2 deletions(-)

diff --git a/pyaptamer/utils/_augment.py b/pyaptamer/utils/_augment.py
index 3a23ac3a..860e2cf5 100644
--- a/pyaptamer/utils/_augment.py
+++ b/pyaptamer/utils/_augment.py
@@ -1,8 +1,12 @@
 __author__ = ["nennomp"]
-__all__ = ["augment_reverse"]
+__all__ = ["augment_reverse", "augment_complement"]
 
 import numpy as np
 
+# Base -> complement tables for str.translate (case kept, other chars unchanged)
+_DNA_COMPLEMENT = str.maketrans("ACGTacgt", "TGCAtgca")
+_RNA_COMPLEMENT = str.maketrans("ACGUacgu", "UGCAugca")
+
 
 def augment_reverse(*sequence_arrays: np.ndarray) -> tuple[np.ndarray, ...]:
     """Augment arrays of sequences by adding the reversed sequences.
@@ -27,3 +31,81 @@ def augment_reverse(*sequence_arrays: np.ndarray) -> tuple[np.ndarray, ...]:
         results.append(result)
 
     return tuple(results)
+
+
+def _reverse_complement(sequence: str, molecule_type: str) -> str:
+    """Return ``sequence`` complemented base-by-base and read back to front.
+
+    Parameters
+    ----------
+    sequence : str
+        Nucleotide string; characters without a mapping are kept as-is.
+    molecule_type : str
+        ``"rna"`` picks the A/U table; any other value picks the A/T table.
+
+    Returns
+    -------
+    str
+        The complemented sequence in reversed order, with case preserved.
+    """
+    mapping = _RNA_COMPLEMENT if molecule_type == "rna" else _DNA_COMPLEMENT
+    return sequence[::-1].translate(mapping)
+
+
+def augment_complement(
+    *sequence_arrays: np.ndarray,
+    molecule_type: str = "rna",
+) -> tuple[np.ndarray, ...]:
+    """Augment arrays of sequences by appending their reverse complements.
+
+    Every sequence is complemented base-by-base (A↔U for RNA, A↔T for DNA,
+    C↔G for both) and reversed, i.e. rewritten as the complementary strand
+    of a nucleic acid duplex. Case is preserved, and characters without a
+    complement in the selected alphabet (e.g. ``N``, or ``T`` when
+    ``molecule_type="rna"``) are kept unchanged.
+
+    Parameters
+    ----------
+    *sequence_arrays : np.ndarray
+        Variable number of numpy arrays of sequences (containing strings).
+    molecule_type : str, optional, default="rna"
+        Complement mapping to apply, matched case-sensitively:
+
+        - ``"rna"``: A↔U, C↔G
+        - ``"dna"``: A↔T, C↔G
+
+    Returns
+    -------
+    tuple[np.ndarray, ...]
+        One array per input, holding the original sequences followed by
+        their reverse complements in the same order. Each output keeps the
+        dtype of its input, including when the input is empty.
+
+    Raises
+    ------
+    ValueError
+        If ``molecule_type`` is not ``"rna"`` or ``"dna"``.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from pyaptamer.utils._augment import augment_complement
+    >>> seqs = np.array(["AUGC", "GCAU"])
+    >>> (result,) = augment_complement(seqs, molecule_type="rna")
+    >>> result
+    array(['AUGC', 'GCAU', 'GCAU', 'AUGC'], dtype='<U4')
+    """
+    if molecule_type not in ("rna", "dna"):
+        raise ValueError(
+            f"molecule_type must be 'rna' or 'dna', got '{molecule_type}'."
+        )
+
+    results = []
+    for sequences in map(np.asarray, sequence_arrays):
+        rc = [_reverse_complement(seq, molecule_type) for seq in sequences]
+        # Build with the input dtype: reverse complements keep their length,
+        # and an empty list would otherwise default to float64.
+        rc_sequences = np.array(rc, dtype=sequences.dtype)
+        results.append(np.concatenate([sequences, rc_sequences]))
+
+    return tuple(results)
diff --git a/pyaptamer/utils/tests/test_augment.py b/pyaptamer/utils/tests/test_augment.py
index 45e5f942..046f8fa6 100644
--- a/pyaptamer/utils/tests/test_augment.py
+++ b/pyaptamer/utils/tests/test_augment.py
@@ -3,8 +3,9 @@
 __author__ = ["nennomp"]
 
 import numpy as np
+import pytest
 
-from pyaptamer.utils._augment import augment_reverse
+from pyaptamer.utils._augment import augment_complement, augment_reverse
 
 
 def test_augment_reverse_single_array():
@@ -37,3 +38,78 @@ def test_augment_reverse_multiple_arrays():
     np.testing.assert_array_equal(result[0], expected[0])
     np.testing.assert_array_equal(result[1], expected[1])
     np.testing.assert_array_equal(result[2], expected[2])
+
+
+# ---------- Tests for augment_complement ----------
+
+
+class TestAugmentComplement:
+    """Tests for the augment_complement function."""
+
+    def test_rna_complement_basic(self):
+        """Test basic RNA reverse complement: A<->U, C<->G."""
+        seqs = np.array(["AUGC"])
+        (result,) = augment_complement(seqs, molecule_type="rna")
+
+        # The original comes first, followed by its reverse complement:
+        # AUGC -> complement UACG -> reversed GCAU
+        expected = np.array(["AUGC", "GCAU"])
+        np.testing.assert_array_equal(result, expected)
+
+    def test_dna_complement_basic(self):
+        """Test basic DNA reverse complement: A<->T, C<->G."""
+        seqs = np.array(["ATGC"])
+        (result,) = augment_complement(seqs, molecule_type="dna")
+
+        # The original comes first, followed by its reverse complement:
+        # ATGC -> complement TACG -> reversed GCAT
+        expected = np.array(["ATGC", "GCAT"])
+        np.testing.assert_array_equal(result, expected)
+
+    def test_rna_complement_preserves_case(self):
+        """Test that case is preserved in complement."""
+        seqs = np.array(["AuGc"])
+        (result,) = augment_complement(seqs, molecule_type="rna")
+        # AuGc -> complement UaCg -> reversed gCaU
+        assert result[1] == "gCaU"
+
+    def test_complement_doubles_array_size(self):
+        """Test that output is exactly double the input."""
+        seqs = np.array(["AAAA", "CCCC", "GGGG"])
+        (result,) = augment_complement(seqs, molecule_type="rna")
+        assert len(result) == 2 * len(seqs)
+
+    def test_complement_multiple_arrays(self):
+        """Test augmentation with multiple arrays."""
+        seq1 = np.array(["AUGC"])
+        seq2 = np.array(["GCAU", "AAAA"])
+        r1, r2 = augment_complement(seq1, seq2, molecule_type="rna")
+        # Each array is augmented independently, in argument order.
+        np.testing.assert_array_equal(r1, ["AUGC", "GCAU"])
+        np.testing.assert_array_equal(r2, ["GCAU", "AAAA", "AUGC", "UUUU"])
+
+    def test_complement_invalid_molecule_type(self):
+        """Test that invalid molecule_type raises ValueError."""
+        seqs = np.array(["AUGC"])
+        with pytest.raises(ValueError, match="molecule_type must be"):
+            augment_complement(seqs, molecule_type="protein")
+
+    def test_complement_roundtrip(self):
+        """Test that double complement returns original sequences."""
+        seqs = np.array(["AUGCUAGC"])
+        (first,) = augment_complement(seqs, molecule_type="rna")
+        # first[1:] holds the reverse complement; complementing it again
+        # must give back the original sequence.
+        (second,) = augment_complement(first[1:], molecule_type="rna")
+        np.testing.assert_array_equal(second[1:], seqs)
+
+    def test_dna_complement_poly_sequences(self):
+        """Test complement of poly-nucleotide sequences."""
+        seqs = np.array(["AAAA", "TTTT", "CCCC", "GGGG"])
+        (result,) = augment_complement(seqs, molecule_type="dna")
+
+        # Originals are kept in place; each homopolymer maps to its partner:
+        # rc(AAAA) = TTTT, rc(TTTT) = AAAA, rc(CCCC) = GGGG, rc(GGGG) = CCCC
+        np.testing.assert_array_equal(result[:4], seqs)
+        np.testing.assert_array_equal(result[4:], ["TTTT", "AAAA", "GGGG", "CCCC"])
+