Skip to content

Commit 3310bb0

Browse files
[BUG] Accept AptaCom-style DataFrames in AptaNetPipeline
Support both the legacy DataFrame schema ('aptamer'/'protein') and the AptaCom loader schema ('aptamer_sequence'/'target_sequence') in pairs_to_features so AptaNetPipeline can consume the package's own training DataFrames. Add regression coverage for both supported DataFrame schemas, explicit validation on unsupported schemas, and an end-to-end AptaNetPipeline fit/predict smoke test for the AptaCom-style input.
1 parent 6cdc02c commit 3310bb0

3 files changed

Lines changed: 96 additions & 2 deletions

File tree

pyaptamer/aptanet/tests/test_aptanet.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33

44
import numpy as np
5+
import pandas as pd
56
import pytest
67
from sklearn.utils.estimator_checks import parametrize_with_checks
78

@@ -70,6 +71,32 @@ def test_pipeline_fit_and_predict_regression(aptamer_seq, protein_seq):
7071
assert np.issubdtype(preds.dtype, np.floating)
7172

7273

74+
@pytest.mark.parametrize("aptamer_seq, protein_seq", params)
def test_pipeline_fit_and_predict_with_aptacom_dataframe(aptamer_seq, protein_seq):
    """Fit and predict through AptaNetPipeline on an AptaCom-style DataFrame.

    The input uses the 'aptamer_sequence' / 'target_sequence' column names
    instead of the legacy 'aptamer' / 'protein' ones. The pipeline must train
    and return one binary prediction per row.
    """
    n_samples = 10
    half = n_samples // 2

    # Same (aptamer, protein) pair repeated on every row; only the column
    # naming is under test here.
    frame = pd.DataFrame(
        {
            "aptamer_sequence": [aptamer_seq] * n_samples,
            "target_sequence": [protein_seq] * n_samples,
        }
    )
    labels = np.array([0] * half + [1] * half, dtype=np.float32)

    # Deliberately tiny network and a single epoch: this is a smoke test.
    pipeline = AptaNetPipeline(
        estimator=AptaNetClassifier(
            hidden_dim=8, n_hidden=1, max_epochs=1, random_state=0
        )
    )
    pipeline.fit(frame, labels)
    predictions = pipeline.predict(frame)

    assert predictions.shape == (n_samples,)
    assert set(predictions).issubset({0, 1})
98+
99+
73100
@parametrize_with_checks(
74101
estimators=[AptaNetClassifier(), AptaNetRegressor()],
75102
expected_failed_checks={

pyaptamer/utils/_aptanet_utils.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@
88

99
from pyaptamer.pseaac import AptaNetPSeAAC
1010

11+
_DATAFRAME_COLUMN_PAIRS = (
12+
("aptamer", "protein"),
13+
("aptamer_sequence", "target_sequence"),
14+
)
15+
1116

1217
def generate_kmer_vecs(aptamer_sequence, k=4):
1318
"""
@@ -57,10 +62,25 @@ def generate_kmer_vecs(aptamer_sequence, k=4):
5762
return kmer_freq
5863

5964

65+
def _resolve_pair_columns(X: pd.DataFrame) -> tuple[str, str]:
    """Pick the aptamer/protein column names present in a pair DataFrame.

    The candidate name pairs in ``_DATAFRAME_COLUMN_PAIRS`` are tried in
    order; the first pair whose two columns both exist in ``X`` wins.

    Parameters
    ----------
    X : pd.DataFrame
        DataFrame holding one aptamer column and one protein column.

    Returns
    -------
    tuple[str, str]
        ``(aptamer_column, protein_column)`` names found in ``X``.

    Raises
    ------
    ValueError
        If none of the supported column pairs is fully present in ``X``.
    """
    present = set(X.columns)
    match = next(
        (pair for pair in _DATAFRAME_COLUMN_PAIRS if present.issuperset(pair)),
        None,
    )
    if match is not None:
        aptamer_name, protein_name = match
        return aptamer_name, protein_name

    # No schema matched: list every accepted pair alongside what was received.
    schemas = " or ".join(repr(list(pair)) for pair in _DATAFRAME_COLUMN_PAIRS)
    raise ValueError(
        f"DataFrame input must contain {schemas} columns. Got {list(X.columns)!r}."
    )
77+
78+
6079
def pairs_to_features(X, k=4):
6180
"""
6281
Convert a list of (aptamer_sequence, protein_sequence) pairs into feature vectors.
63-
Also supports a pandas DataFrame with 'aptamer' and 'protein' columns.
82+
Also supports pandas DataFrames with either ['aptamer', 'protein'] or
83+
['aptamer_sequence', 'target_sequence'] columns.
6484
6585
This function generates feature vectors for each (aptamer, protein) pair using:
6686
@@ -87,7 +107,8 @@ def pairs_to_features(X, k=4):
87107
feats = []
88108

89109
if isinstance(X, pd.DataFrame):
90-
pairs = zip(X["aptamer"], X["protein"], strict=False)
110+
aptamer_col, protein_col = _resolve_pair_columns(X)
111+
pairs = zip(X[aptamer_col], X[protein_col], strict=False)
91112
else:
92113
pairs = X
93114

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import numpy as np
2+
import pandas as pd
3+
import pytest
4+
5+
from pyaptamer.utils._aptanet_utils import pairs_to_features
6+
7+
# Fixed nucleotide sequence (A/C/G/T only) used as the aptamer in every test.
APTAMER_SEQ = "AGCTTAGCGTACAGCTTAAAAGGGTTTCCCCTGCCCGCGTAC"
# The 20-letter amino-acid alphabet "ACDEFGHIKLMNPQRSTVWY" written twice.
PROTEIN_SEQ = "ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY"
9+
10+
11+
@pytest.mark.parametrize(
    ("aptamer_col", "protein_col"),
    [
        ("aptamer", "protein"),
        ("aptamer_sequence", "target_sequence"),
    ],
)
def test_pairs_to_features_accepts_supported_dataframe_schemas(
    aptamer_col, protein_col
):
    """Each supported DataFrame schema must match the list-of-pairs output.

    The same two (aptamer, protein) rows are fed once as a plain list of
    tuples and once as a DataFrame using the parametrized column names; the
    resulting feature matrices are compared numerically.
    """
    n_rows = 2
    frame = pd.DataFrame(
        {
            aptamer_col: [APTAMER_SEQ] * n_rows,
            protein_col: [PROTEIN_SEQ] * n_rows,
        }
    )

    # The list-of-tuples input is the reference the DataFrame path must match.
    reference = pairs_to_features([(APTAMER_SEQ, PROTEIN_SEQ)] * n_rows)

    np.testing.assert_allclose(pairs_to_features(frame), reference)
34+
35+
36+
def test_pairs_to_features_rejects_unknown_dataframe_schema():
    """A DataFrame with unrecognized column names must raise ValueError.

    The column names 'aptamer_seq' / 'protein_seq' match neither supported
    schema, so the call is expected to fail with the descriptive message.
    """
    unsupported = pd.DataFrame(
        {"aptamer_seq": [APTAMER_SEQ], "protein_seq": [PROTEIN_SEQ]}
    )

    with pytest.raises(ValueError, match="DataFrame input must contain"):
        pairs_to_features(unsupported)

0 commit comments

Comments
 (0)