ligandMPNN_FR/gen_prot_lig_dist_cst.py at master · daylight-00/ligandMPNN_FR · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from Bio.PDB import PDBParser
import numpy as np

CST_STDERR = 0.5

def extract_dist_cst_from_pdb(pdb_in, lig_tr_atms, bsite_res=''):
    """
    pdb_in      : Path to the PDB file
    lig_tr_atms : List of atom names of interest in the ligand, e.g., ['NE1','O', ...]
    bsite_res   : If '', use all protein CA residues; otherwise, a comma-separated string of residue numbers like '10,15,27'
    return      : ['AtomPair <lig_atm> <het_resno> CA <closest_resno> HARMONIC <dist> 0.5', ...]
    """
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("strc", pdb_in)

    # Assume a single chain/model (add loop if needed)
    model = next(structure.get_models())

    # bsite residue set
    if bsite_res == '':
        # All protein CA residue numbers
        bsite_res_s = {res.get_id()[1] for res in model.get_residues()
                       if res.get_id()[0] == ' ' and 'CA' in res}  # ' ' => ATOM
    else:
        bsite_res_s = {int(x) for x in bsite_res.split(',')}

    # Last protein residue number (for estimating ligand resno)
    last_prot_resno = max(res.get_id()[1] for res in model.get_residues()
                          if res.get_id()[0] == ' ')

    # CA coordinates dict
    bsite_CA_R = {}
    for res in model.get_residues():
        if res.get_id()[0] == ' ' and res.get_id()[1] in bsite_res_s and 'CA' in res:
            bsite_CA_R[res.get_id()[1]] = res['CA'].get_coord()

    # Ligand atom coordinates dict
    het_atm_R = {}
    for res in model.get_residues():
        if res.get_id()[0].startswith('H'):  # HETATM
            for atom in res:
                if atom.get_name() in lig_tr_atms:
                    het_atm_R[atom.get_name()] = atom.get_coord()

    het_resno = last_prot_resno + 1  # Corresponding discontinuous number for diffusion, etc.

    cst_s = []
    bsite_resnos = np.array(list(bsite_CA_R.keys()))
    bsite_coords = np.array([bsite_CA_R[r] for r in bsite_resnos])

    for atm_name in lig_tr_atms:
        het_coord = het_atm_R[atm_name]

        # All CA distances
        diff = bsite_coords - het_coord
        dists = np.sqrt((diff ** 2).sum(axis=1))

        # Closest residue
        idx = np.argmin(dists)
        closest_resno = int(bsite_resnos[idx])
        closest_d = float(dists[idx])

        cst_line = f'AtomPair {atm_name} {het_resno} CA {closest_resno} HARMONIC {closest_d} {CST_STDERR}'
        cst_s.append(cst_line)

    return cst_s