# Source: hatch/global_classifier.py at main · kmesiab/hatch · GitHub
"""
global_classifier.py — Hatch v7: Global Feature Classifier

Ports the global feature classifier from the original Concept Model Experiment
notebook into the Hatch codebase. Uses whole-protein feature averages (not
sliding windows) with a k-of-7 threshold vote.

This classifier achieves ~85% accuracy on the full dataset (8000-sequence test
set). The goal of v7 is to apply Coordinate Descent threshold calibration to
push toward 87–89%.
"""

import math
import json
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Amino acid property tables (identical to original notebook)
# ---------------------------------------------------------------------------
# Per-residue hydropathy values keyed by one-letter amino acid code.
# The values in this table span -4.5 (R) to 4.5 (I); presumably the
# Kyte-Doolittle scale, judging by the name -- TODO confirm the source.
KD_HYDRO = {
    'A':  1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C':  2.5,
    'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I':  4.5,
    'L':  3.8, 'K': -3.9, 'M':  1.9, 'F':  2.8, 'P': -1.6,
    'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V':  4.2
}
# Integer charge assigned to each residue: +1 for R and K, -1 for D and E,
# and 0 for everything else (H is treated as neutral in this table).
CHARGE = {
    'A':  0, 'R':  1, 'N':  0, 'D': -1, 'C':  0,
    'Q':  0, 'E': -1, 'G':  0, 'H':  0, 'I':  0,
    'L':  0, 'K':  1, 'M':  0, 'F':  0, 'P':  0,
    'S':  0, 'T':  0, 'W':  0, 'Y':  0, 'V':  0
}
# Integer count per residue; presumably the number of hydrogen-bond donor
# groups on the side chain -- TODO confirm the source of these counts.
H_DONORS = {
    'A':0,'R':2,'N':2,'D':0,'C':0,'Q':2,'E':0,'G':0,'H':1,'I':0,
    'L':0,'K':1,'M':0,'F':0,'P':0,'S':1,'T':1,'W':1,'Y':1,'V':0
}
# Integer count per residue; presumably the number of hydrogen-bond acceptor
# groups on the side chain -- TODO confirm the source of these counts.
H_ACCEPTORS = {
    'A':0,'R':0,'N':2,'D':2,'C':1,'Q':2,'E':2,'G':0,'H':1,'I':0,
    'L':0,'K':0,'M':0,'F':0,'P':0,'S':1,'T':1,'W':0,'Y':1,'V':0
}
# Per-residue flexibility values (source scale not shown in this file).
# The largest entry is G = 0.544, which compute_global_features uses as the
# divisor when normalizing flexibility.
FLEXIBILITY = {
    'A': 0.357, 'R': 0.529, 'N': 0.463, 'D': 0.511, 'C': 0.346,
    'Q': 0.493, 'E': 0.497, 'G': 0.544, 'H': 0.323, 'I': 0.462,
    'L': 0.365, 'K': 0.466, 'M': 0.295, 'F': 0.314, 'P': 0.509,
    'S': 0.507, 'T': 0.444, 'W': 0.305, 'Y': 0.420, 'V': 0.386
}

# The 20 one-letter codes that have an entry in KD_HYDRO; any other character
# in a sequence is ignored by the feature code below.
CANONICAL_SET = set(KD_HYDRO.keys())
# Residues whose combined frequency forms the "freq_bulky_hydrophobics" feature.
BULKY_HYDROPHOBICS = {'W', 'C', 'F', 'Y', 'I', 'V', 'L'}

# Feature names, in the same order as the vector returned by
# compute_global_features. GlobalClassifier relies on this order to match each
# threshold to its position in the feature vector.
FEATURE_NAMES = [
    "hydro_norm_avg",
    "flex_norm_avg",
    "h_bond_potential_avg",
    "abs_net_charge_prop",
    "shannon_entropy",
    "freq_proline",
    "freq_bulky_hydrophobics",
]

# Build one small property record per canonical residue up front, so the
# per-sequence feature loop only needs a single dict lookup per residue.
AA_PROPS = {}
for aa in CANONICAL_SET:
    AA_PROPS[aa] = dict(
        # Shift and scale hydropathy from the table's [-4.5, 4.5] range to [0, 1].
        hydro_norm=(KD_HYDRO[aa] + 4.5) / 9.0,
        flexibility=FLEXIBILITY[aa],
        h_donors=H_DONORS[aa],
        h_acceptors=H_ACCEPTORS[aa],
    )

# ---------------------------------------------------------------------------
# Feature computation
# ---------------------------------------------------------------------------

def _aa_composition(seq: str) -> tuple[dict, int]:
    """Return per-residue frequencies and the number of canonical residues.

    Characters outside CANONICAL_SET are ignored. The returned dict always
    has one key per canonical residue. When the sequence contains no
    canonical residue, every value is the integer 0 and the count is 0;
    otherwise each value is that residue's count divided by the count of
    canonical residues.
    """
    recognised = [residue for residue in seq if residue in CANONICAL_SET]
    total = len(recognised)

    tally = dict.fromkeys(CANONICAL_SET, 0)
    for residue in recognised:
        tally[residue] += 1

    if not total:
        # Nothing to normalize; hand back the all-zero counts unchanged.
        return tally, total

    frequencies = {residue: hits / total for residue, hits in tally.items()}
    return frequencies, total


def _shannon_entropy(comp: dict) -> float:
    """Return the Shannon entropy, in bits, of the frequencies in *comp*.

    Only the dict values are used. Entries that are not strictly positive
    contribute nothing, so an empty or all-zero composition yields 0.0.
    """
    entropy = 0.0
    for freq in comp.values():
        if not freq > 0:
            # log2 is undefined at 0; such terms are treated as contributing 0.
            continue
        entropy -= freq * math.log2(freq)
    return entropy


def compute_global_features(seq: str) -> np.ndarray:
    """Return the 7 whole-sequence features, ordered as in FEATURE_NAMES.

    Non-canonical characters are dropped before anything is computed. A
    sequence with no canonical residue yields a vector of seven zeros.
    Averages are taken over the canonical residues only.
    """
    residues = "".join(ch for ch in seq if ch in CANONICAL_SET)
    if not residues:
        return np.zeros(7)

    freqs, length = _aa_composition(residues)
    if length == 0:
        return np.zeros(7)

    # Accumulate the per-residue properties in sequence order.
    hydro_total = flex_total = hbond_total = 0.0
    for props in map(AA_PROPS.get, residues):
        if props is None:
            continue
        hydro_total += props['hydro_norm']
        # 0.544 is the largest value in FLEXIBILITY (Gly), so this maps to (0, 1].
        flex_total += props['flexibility'] / 0.544
        hbond_total += props['h_donors'] + props['h_acceptors']

    # Net charge is expressed as a proportion: it is built from frequencies.
    basic_fraction = freqs.get('R', 0) + freqs.get('K', 0)
    acidic_fraction = freqs.get('D', 0) + freqs.get('E', 0)
    bulky_fraction = sum(freqs.get(aa, 0) for aa in BULKY_HYDROPHOBICS)

    features = [
        hydro_total / length,                   # hydro_norm_avg
        flex_total / length,                    # flex_norm_avg
        hbond_total / length,                   # h_bond_potential_avg
        abs(basic_fraction - acidic_fraction),  # abs_net_charge_prop
        _shannon_entropy(freqs),                # shannon_entropy
        freqs.get('P', 0),                      # freq_proline
        bulky_fraction,                         # freq_bulky_hydrophobics
    ]
    return np.array(features)


# ---------------------------------------------------------------------------
# Training: compute midpoint thresholds from folded/disordered medians
# ---------------------------------------------------------------------------

def _load_fasta(path: str, label: int) -> list[dict]:
    """Parse a FASTA file into a list of labelled sequence records.

    Args:
        path: Location of the FASTA file.
        label: Class label stored on every record read from this file.

    Returns:
        One dict per record with keys ``'sequence'`` (all sequence lines of
        the record joined together, each stripped of surrounding
        whitespace), ``'label'`` and ``'header'`` (the stripped header line,
        including the leading ``>``). Records whose sequence is empty are
        skipped, as are any lines that appear before the first header.
        If the file does not exist, a warning is printed and an empty list
        is returned (best-effort behaviour; no exception is raised).
    """
    records = []

    def _emit(header, chunks):
        # A record is only kept when it has both a header and a non-empty body.
        seq = "".join(chunks)
        if header and seq:
            records.append({'sequence': seq, 'label': label, 'header': header})

    # Keep the try narrow: only a missing file is downgraded to a warning.
    try:
        handle = open(path)
    except FileNotFoundError:
        print(f"Warning: {path} not found.")
        return records

    header = None
    chunks = []  # sequence lines of the current record; joined once on emit
    with handle:
        for raw in handle:
            line = raw.strip()
            if line.startswith(">"):
                _emit(header, chunks)
                header = line
                chunks = []
            elif header is not None:
                chunks.append(line)
    _emit(header, chunks)
    return records


def train_global(
    pdb_fasta: str = "pdb_chains.fasta",
    disprot_fasta: str = "disprot_13000.fasta",
    test_size: float = 0.20,
    random_state: int = 42,
    save_thresholds: str = "global_thresholds_v7.json",
) -> dict:
    """
    Load both FASTA sets, compute global features, and derive baseline thresholds.

    Sequences from ``pdb_fasta`` are labelled 1 (folded) and sequences from
    ``disprot_fasta`` are labelled 0 (disordered). The data is split with a
    stratified train/test split, and thresholds are computed from the
    training portion only.

    Args:
        pdb_fasta: FASTA file of folded sequences (label 1).
        disprot_fasta: FASTA file of disordered sequences (label 0).
        test_size: Fraction passed to ``train_test_split``.
        random_state: Seed passed to ``train_test_split``.
        save_thresholds: JSON path to write the thresholds and directions
            to. Any falsy value (e.g. ``""``) skips writing.

    Returns:
        A dict with keys ``'X_train'``, ``'y_train'``, ``'X_test'``,
        ``'y_test'``, ``'midpoints'`` (feature name -> threshold),
        ``'folded_is_high'`` (feature name -> bool-like direction flag) and
        ``'train_means'`` (per-class feature means as a DataFrame).

    Raises:
        ValueError: If either class has no sequences, e.g. because a FASTA
            file is missing or empty. Thresholds cannot be derived then.
    """
    print("Loading sequences...")
    data = []
    data.extend(_load_fasta(pdb_fasta, 1))
    data.extend(_load_fasta(disprot_fasta, 0))
    y = np.array([d['label'] for d in data], dtype=int)
    n_folded = int(np.count_nonzero(y == 1))
    n_disordered = int(np.count_nonzero(y == 0))
    print(f"  Folded: {n_folded:,}  "
          f"Disordered: {n_disordered:,}")

    # Fail early with an actionable message. Without this check an empty or
    # single-class dataset only fails further down, inside numpy/pandas.
    if n_folded == 0 or n_disordered == 0:
        raise ValueError(
            "train_global needs at least one folded and one disordered sequence "
            f"(got {n_folded} from {pdb_fasta!r} and "
            f"{n_disordered} from {disprot_fasta!r})."
        )

    print("Computing global features...")
    X = np.vstack([compute_global_features(d['sequence']) for d in data])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Compute midpoint thresholds from the training set only.
    df_train = pd.DataFrame(X_train, columns=FEATURE_NAMES)
    df_train['label'] = y_train

    train_means = df_train.groupby('label')[FEATURE_NAMES].mean()
    # label 0 = DisProt, label 1 = PDB (folded)
    medians_folded = df_train[df_train['label'] == 1][FEATURE_NAMES].median()
    medians_disordered = df_train[df_train['label'] == 0][FEATURE_NAMES].median()
    # Each threshold sits halfway between the two class medians.
    midpoints = ((medians_folded + medians_disordered) / 2).to_dict()

    print("\nGlobal Feature Means (DisProt vs PDB):")
    print(train_means.rename(index={0: 'DisProt', 1: 'PDB'}).to_string())
    print("\nMidpoint Thresholds:")
    for k, v in midpoints.items():
        print(f"  {k:30s} = {v:.4f}")

    # Direction of each feature: is the folded mean above the disordered mean?
    # NOTE(review): direction uses class *means* while the thresholds use
    # class *medians*; kept as-is to preserve the existing baseline numbers.
    folded_is_high = {
        feat: (train_means.loc[1, feat] > train_means.loc[0, feat])
        for feat in FEATURE_NAMES
    }

    meta = {
        'X_train': X_train, 'y_train': y_train,
        'X_test':  X_test,  'y_test':  y_test,
        'midpoints': midpoints,
        'folded_is_high': folded_is_high,
        'train_means': train_means,
    }

    if save_thresholds:
        payload = {
            'midpoints': midpoints,
            # Cast to plain bool so the comparison results are JSON-serializable.
            'folded_is_high': {k: bool(v) for k, v in folded_is_high.items()},
        }
        with open(save_thresholds, 'w') as f:
            json.dump(payload, f, indent=2)
        print(f"\nThresholds saved to {save_thresholds}")

    return meta


# ---------------------------------------------------------------------------
# Classifier
# ---------------------------------------------------------------------------

class GlobalClassifier:
    """
    k-of-7 vote over per-feature thresholds.

    Each feature in FEATURE_NAMES contributes one vote when its value lies on
    the folded side of its threshold (inclusive of the threshold itself). A
    sequence is predicted FOLDED (1) when the number of votes reaches
    ``k_threshold``; otherwise it is predicted DISORDERED (0).
    """

    def __init__(self, thresholds: dict, folded_is_high: dict, k_threshold: int = 5):
        # Minimum number of satisfied conditions required to predict FOLDED.
        self.k_threshold = k_threshold
        # {feature_name: bool} -- True when larger values indicate folded.
        self.folded_is_high = folded_is_high
        # {feature_name: float} -- decision boundary for each feature.
        self.thresholds = thresholds

    def _count_conditions(self, features: np.ndarray) -> int:
        """Return how many per-feature conditions *features* satisfies.

        ``features[i]`` is compared against the threshold of
        ``FEATURE_NAMES[i]``; the comparison is ``>=`` for features where
        folded is high and ``<=`` otherwise.
        """
        votes = 0
        for position, name in enumerate(FEATURE_NAMES):
            cutoff = self.thresholds[name]
            wants_high = self.folded_is_high[name]
            value = features[position]
            on_folded_side = value >= cutoff if wants_high else value <= cutoff
            if on_folded_side:
                votes += 1
        return votes

    def classify(self, seq: str) -> int:
        """Predict 1 (FOLDED) or 0 (DISORDERED) for a raw sequence."""
        votes = self._count_conditions(compute_global_features(seq))
        if votes >= self.k_threshold:
            return 1
        return 0

    def score_vector(self, seq: str) -> np.ndarray:
        """Return the unthresholded feature vector of a sequence."""
        vector = compute_global_features(seq)
        return vector

    def conditions_met(self, seq: str) -> int:
        """Return the number of satisfied conditions (0–7) for a sequence."""
        vector = compute_global_features(seq)
        return self._count_conditions(vector)


def evaluate_global(clf: GlobalClassifier, X_test: np.ndarray, y_test: np.ndarray) -> dict:
    """Evaluate the classifier on a pre-computed feature matrix.

    Args:
        clf: Classifier whose thresholds and ``k_threshold`` are applied.
        X_test: Feature matrix; each row is one feature vector in
            FEATURE_NAMES order.
        y_test: True labels (0 = DisProt/disordered, 1 = PDB/folded).

    Returns:
        A dict with ``'accuracy'``, ``'mean_f1'`` (unweighted mean of the two
        per-class F1 scores), ``'f1_folded'``, ``'f1_disordered'``,
        ``'confusion_matrix'`` (2x2, rows = actual, columns = predicted, in
        label order 0, 1), ``'report'`` (the ``classification_report`` dict)
        and ``'predictions'`` (array of 0/1 predictions).
    """
    # Pin the label set so the confusion matrix is always 2x2 and the report
    # always has both class entries, even when a class is absent from both
    # y_test and the predictions.
    class_labels = [0, 1]
    class_names = ["DisProt (0)", "PDB (1)"]

    preds = np.array(
        [1 if clf._count_conditions(row) >= clf.k_threshold else 0 for row in X_test],
        dtype=int,
    )
    cm = confusion_matrix(y_test, preds, labels=class_labels)
    report = classification_report(
        y_test, preds,
        labels=class_labels,
        target_names=class_names,
        output_dict=True, zero_division=0
    )
    # Diagonal entries are the correct predictions for each class.
    acc = (cm[0, 0] + cm[1, 1]) / cm.sum()
    f1_disordered = report['DisProt (0)']['f1-score']
    f1_folded = report['PDB (1)']['f1-score']
    mean_f1 = (f1_disordered + f1_folded) / 2
    return {
        'accuracy': acc,
        'mean_f1': mean_f1,
        'f1_folded': f1_folded,
        'f1_disordered': f1_disordered,
        'confusion_matrix': cm,
        'report': report,
        'predictions': preds,
    }


# ---------------------------------------------------------------------------
# CLI: run baseline benchmark
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    import os

    # Work from this file's directory so the default relative FASTA/JSON
    # paths used by train_global() resolve next to the script.
    os.chdir(Path(__file__).parent)

    meta = train_global()
    X_test = meta['X_test']
    y_test = meta['y_test']
    thresholds = meta['midpoints']
    directions = meta['folded_is_high']

    n_folded = (y_test == 1).sum()
    n_disordered = (y_test == 0).sum()
    print("\n\n=== BASELINE GLOBAL CLASSIFIER (midpoint thresholds) ===")
    print(f"Test set: {len(y_test):,} sequences  "
          f"({n_folded:,} folded, {n_disordered:,} disordered)\n")

    # Sweep every possible vote count k = 1..7 and record the metrics.
    sweep = []
    for k in range(1, 8):
        candidate = GlobalClassifier(thresholds, directions, k_threshold=k)
        metrics = evaluate_global(candidate, X_test, y_test)
        row = {'k': k}
        row.update(metrics)
        sweep.append(row)
        print(f"k={k}  Accuracy={metrics['accuracy']:.2%}  "
              f"F1_Folded={metrics['f1_folded']:.2%}  "
              f"F1_Disordered={metrics['f1_disordered']:.2%}  "
              f"Mean_F1={metrics['mean_f1']:.2%}")

    # Pick the k with the highest mean F1 (the first such k wins on ties).
    best = max(sweep, key=lambda entry: entry['mean_f1'])
    print(f"\nBest k={best['k']}  Mean F1={best['mean_f1']:.2%}  Accuracy={best['accuracy']:.2%}")

    print("\nDetailed report:")
    winner = GlobalClassifier(thresholds, directions, k_threshold=best['k'])
    winner_metrics = evaluate_global(winner, X_test, y_test)
    print(classification_report(
        y_test, winner_metrics['predictions'],
        target_names=["DisProt (0)", "PDB (1)"], zero_division=0
    ))

    print("Confusion Matrix:")
    cm_table = pd.DataFrame(
        winner_metrics['confusion_matrix'],
        index=["Actual DisProt", "Actual PDB"],
        columns=["Pred DisProt", "Pred PDB"]
    )
    print(cm_table)