-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_binding_site.py
More file actions
115 lines (98 loc) · 4.87 KB
/
extract_binding_site.py
File metadata and controls
115 lines (98 loc) · 4.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python3
"""
extract_binding_site.py – receptor–peptide interface mapping (May 2025)
For each PDB/mmCIF file:
• Treat the largest protein chain as the *receptor*.
• Treat every other protein chain as a *peptide*.
• Write every receptor residue whose atom is ≤ cut-off Å from any
peptide atom.
Usage (example)
--------------------------------------
python extract_binding_site.py *.pdb \
--cutoff 4.0 --threads 6 --out peptide_interfaces.csv
"""
from __future__ import annotations
import argparse, csv, gzip, os, sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import StringIO
from pathlib import Path
from typing import List, Tuple
from Bio.PDB import PDBParser, MMCIFParser, NeighborSearch, Selection # pip install biopython
from tqdm import tqdm # pip install tqdm
WATERS = {"HOH", "WAT", "DOD", "D2O"}
# ──────────────── file helpers (PDB / CIF, plain or .gz) ─────────────── #
def open_text(path: Path):
return gzip.open(path, "rt") if path.suffix == ".gz" else open(path, "rt")
def load_structure(path: Path):
txt = open_text(path).read()
hndl = StringIO(txt)
if path.suffix.lower().endswith(".cif") or ".cif." in path.name:
return MMCIFParser(QUIET=True).get_structure(path.stem, hndl)
return PDBParser(QUIET=True).get_structure(path.stem, hndl)
# ───────────────────────── interface calculation ─────────────────────── #
def largest_chain(struct):
"""Return chain with most residues (ties → most atoms)."""
chains = list(struct.get_chains())
chains.sort(key=lambda c: (len(c), len(list(c.get_atoms()))), reverse=True)
return chains[0]
def protein_atoms(chain):
"""All standard-residue atoms from a chain (no HETATM)."""
return [a for a in chain.get_atoms() if a.get_parent().id[0] == " "]
def find_contacts(path: Path, cutoff: float) -> List[Tuple]:
st = load_structure(path)
rec = largest_chain(st)
rec_atoms = protein_atoms(rec)
pep_atoms = [a for c in st.get_chains() if c != rec
for a in protein_atoms(c)]
if not pep_atoms:
return [] # nothing but the receptor present
neigh = NeighborSearch(rec_atoms) # KD-tree on receptor
hits = set()
for a in pep_atoms:
for nb in neigh.search(a.coord, cutoff):
rres = nb.get_parent()
pchn = a.get_parent().get_parent().id
rcid = rec.id
resid, icode = rres.id[1], rres.id[2].strip() or "-"
hits.add((path.name, rcid, pchn, resid, icode, rres.resname))
return sorted(hits, key=lambda x: (x[1], x[2], x[3]))
# ───────────────────────────────── main ───────────────────────────────── #
def parse_cli(argv=None):
ap = argparse.ArgumentParser(description="Map receptor–peptide interfaces")
ap.add_argument("pdb", nargs="*", default=["**/*.pdb", "**/*.cif", "**/*.pdb.gz", "**/*.cif.gz"],
help="files / globs / directories (defaults recurse for *.pdb/*cif)")
ap.add_argument("--cutoff", type=float, default=4.0,
help="distance cut-off in Å [4.0]")
ap.add_argument("--threads", type=int, default=os.cpu_count(),
help="parallel workers [CPU cores]")
ap.add_argument("--out", default="peptide_interfaces.csv",
help="output CSV filename [peptide_interfaces.csv]")
return ap.parse_args(argv)
def gather_paths(items):
pats = []
for it in items:
p = Path(it)
pats.append(str(p / "**/*") if p.is_dir() else it)
seen = set()
for pat in pats:
for f in Path().glob(pat):
if f.is_file():
seen.add(f)
return sorted(seen)
def main(argv=None):
args = parse_cli(argv)
pdb_files = gather_paths(args.pdb)
if not pdb_files:
sys.exit("✗ No structure files found – check paths/globs.")
with ThreadPoolExecutor(max_workers=args.threads) as pool, \
open(args.out, "w", newline="") as fh:
writer = csv.writer(fh)
writer.writerow(["pdb", "receptor_chain",
"peptide_chain", "resi", "icode", "resn"])
futs = {pool.submit(find_contacts, p, args.cutoff): p for p in pdb_files}
for fut in tqdm(as_completed(futs), total=len(futs),
unit="file", desc="Processing"):
writer.writerows(fut.result())
print(f"✔ {len(pdb_files)} file(s) → {args.out}")
if __name__ == "__main__":
main()