[BUG] Accept AptaCom-style DataFrames in AptaNetPipeline

officialasishkumar · officialasishkumar · commit 3310bb0bfabf · 2026-04-06T18:11:58.000Z
Support both the legacy DataFrame schema ('aptamer'/'protein') and the AptaCom loader schema ('aptamer_sequence'/'target_sequence') in pairs_to_features so AptaNetPipeline can consume the package's own training DataFrames.

Add explicit validation that raises a ValueError for unsupported DataFrame schemas, regression tests covering both supported schemas and the rejection path, and an end-to-end AptaNetPipeline fit/predict smoke test for AptaCom-style input.
diff --git a/pyaptamer/aptanet/tests/test_aptanet.py b/pyaptamer/aptanet/tests/test_aptanet.py
@@ -2,6 +2,7 @@
 
 
 import numpy as np
+import pandas as pd
 import pytest
 from sklearn.utils.estimator_checks import parametrize_with_checks
 
@@ -70,6 +71,32 @@ def test_pipeline_fit_and_predict_regression(aptamer_seq, protein_seq):
     assert np.issubdtype(preds.dtype, np.floating)
 
 
+@pytest.mark.parametrize("aptamer_seq, protein_seq", params)
+def test_pipeline_fit_and_predict_with_aptacom_dataframe(aptamer_seq, protein_seq):
+    """AptaNetPipeline should accept the AptaCom X schema directly."""
+    estimator = AptaNetClassifier(
+        hidden_dim=8,
+        n_hidden=1,
+        max_epochs=1,
+        random_state=0,
+    )
+    pipe = AptaNetPipeline(estimator=estimator)
+
+    X_raw = pd.DataFrame(
+        {
+            "aptamer_sequence": [aptamer_seq for _ in range(10)],
+            "target_sequence": [protein_seq for _ in range(10)],
+        }
+    )
+    y = np.array([0] * 5 + [1] * 5, dtype=np.float32)
+
+    pipe.fit(X_raw, y)
+    preds = pipe.predict(X_raw)
+
+    assert preds.shape == (10,)
+    assert set(preds).issubset({0, 1})
+
+
 @parametrize_with_checks(
     estimators=[AptaNetClassifier(), AptaNetRegressor()],
     expected_failed_checks={
diff --git a/pyaptamer/utils/_aptanet_utils.py b/pyaptamer/utils/_aptanet_utils.py
@@ -8,6 +8,11 @@
 
 from pyaptamer.pseaac import AptaNetPSeAAC
 
+_DATAFRAME_COLUMN_PAIRS = (
+    ("aptamer", "protein"),
+    ("aptamer_sequence", "target_sequence"),
+)
+
 
 def generate_kmer_vecs(aptamer_sequence, k=4):
     """
@@ -57,10 +62,25 @@ def generate_kmer_vecs(aptamer_sequence, k=4):
     return kmer_freq
 
 
+def _resolve_pair_columns(X: pd.DataFrame) -> tuple[str, str]:
+    """Resolve supported column names for DataFrame pair inputs."""
+    for aptamer_col, protein_col in _DATAFRAME_COLUMN_PAIRS:
+        if {aptamer_col, protein_col}.issubset(X.columns):
+            return aptamer_col, protein_col
+
+    supported = " or ".join(
+        f"{list(column_pair)!r}" for column_pair in _DATAFRAME_COLUMN_PAIRS
+    )
+    raise ValueError(
+        f"DataFrame input must contain {supported} columns. Got {list(X.columns)!r}."
+    )
+
+
 def pairs_to_features(X, k=4):
     """
     Convert a list of (aptamer_sequence, protein_sequence) pairs into feature vectors.
-    Also supports a pandas DataFrame with 'aptamer' and 'protein' columns.
+    Also supports pandas DataFrames with either ['aptamer', 'protein'] or
+    ['aptamer_sequence', 'target_sequence'] columns.
 
     This function generates feature vectors for each (aptamer, protein) pair using:
 
@@ -87,7 +107,8 @@ def pairs_to_features(X, k=4):
     feats = []
 
     if isinstance(X, pd.DataFrame):
-        pairs = zip(X["aptamer"], X["protein"], strict=False)
+        aptamer_col, protein_col = _resolve_pair_columns(X)
+        pairs = zip(X[aptamer_col], X[protein_col], strict=False)
     else:
         pairs = X
 
diff --git a/pyaptamer/utils/tests/test_aptanet_utils.py b/pyaptamer/utils/tests/test_aptanet_utils.py
@@ -0,0 +1,46 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from pyaptamer.utils._aptanet_utils import pairs_to_features
+
+APTAMER_SEQ = "AGCTTAGCGTACAGCTTAAAAGGGTTTCCCCTGCCCGCGTAC"
+PROTEIN_SEQ = "ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY"
+
+
+@pytest.mark.parametrize(
+    ("aptamer_col", "protein_col"),
+    [
+        ("aptamer", "protein"),
+        ("aptamer_sequence", "target_sequence"),
+    ],
+)
+def test_pairs_to_features_accepts_supported_dataframe_schemas(
+    aptamer_col, protein_col
+):
+    """Supported DataFrame schemas should produce the same feature matrix."""
+    pairs = [(APTAMER_SEQ, PROTEIN_SEQ), (APTAMER_SEQ, PROTEIN_SEQ)]
+    df = pd.DataFrame(
+        {
+            aptamer_col: [APTAMER_SEQ, APTAMER_SEQ],
+            protein_col: [PROTEIN_SEQ, PROTEIN_SEQ],
+        }
+    )
+
+    expected = pairs_to_features(pairs)
+    actual = pairs_to_features(df)
+
+    np.testing.assert_allclose(actual, expected)
+
+
+def test_pairs_to_features_rejects_unknown_dataframe_schema():
+    """Unsupported DataFrame schemas should fail with a helpful message."""
+    df = pd.DataFrame(
+        {
+            "aptamer_seq": [APTAMER_SEQ],
+            "protein_seq": [PROTEIN_SEQ],
+        }
+    )
+
+    with pytest.raises(ValueError, match="DataFrame input must contain"):
+        pairs_to_features(df)