Skip to content

enable selection of Nter and Cter states at topology generation #1273

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 139 additions & 4 deletions integration_tests/test_topoaa.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import tempfile
import copy
from pathlib import Path

import pytest
Expand All @@ -20,17 +21,52 @@ def topoaa_module():
yield topoaa


def extract_nter_cter(fpath) -> tuple[str, str]:
"""Helper function to extract nter and cter residues as strings."""
nter, cter = "", ""
nter_resi, cter_resi = None, None
# Read file
with open(fpath, "r") as fin:
# Loop over lines
for _ in fin:
# Make sure we are looking at coordinates
if _.startswith(("ATOM", "HETATM")):
# Extract residue id
resid = _[22:26].strip()
# First residue should be Nter one
if nter_resi is None:
nter_resi = resid
# Last residue should be Cter one
if cter_resi is None:
cter_resi = resid
else:
# Reset last residue to current residue
# until we cannot do this anymore
# should result in extracting last residue
if cter_resi != resid:
# Reset cter residue to empty string
cter = ""
cter_resi = resid
# Increment first residue
if resid == nter_resi:
nter += _
# Increment last residue
cter += _
return nter, cter


def test_topoaa_module_protein(topoaa_module):
"""Topoaa module with protein-protein input"""
topoaa_module.params["molecules"] = [
Path(GOLDEN_DATA, "e2aP_1F3G.pdb"),
Path(GOLDEN_DATA, "hpr_ensemble.pdb"),
]
topoaa_module.params["mol1"] = {"prot_segid": "A"}
topoaa_module.params["mol2"] = {"prot_segid": "B"}
topoaa_module.params["mol1"]["prot_segid"] = "A"
# Create mol2 parameters by copying the ones found for mol1
topoaa_module.params["mol2"] = copy.deepcopy(topoaa_module.params["mol1"])
topoaa_module.params["mol2"]["prot_segid"] = "B"
topoaa_module.params["cns_exec"] = CNS_EXEC
topoaa_module.params["debug"] = True

topoaa_module.run()

expected_inp = Path(topoaa_module.path, "e2aP_1F3G.inp")
Expand Down Expand Up @@ -94,7 +130,7 @@ def test_topoaa_cyclic(topoaa_module):
]
topoaa_module.params["cyclicpept_dist"] = 3.5
topoaa_module.params["disulphide_dist"] = 4.0
topoaa_module.params["mol1"] = {"cyclicpept": True}
topoaa_module.params["mol1"]["cyclicpept"] = True
topoaa_module.params["cns_exec"] = CNS_EXEC
topoaa_module.params["debug"] = True

Expand All @@ -115,3 +151,102 @@ def test_topoaa_cyclic(topoaa_module):

assert "detected" in file_content
assert "disulphide" in file_content


def test_topoaa_module_protein_noCter(topoaa_module):
"""Topoaa module with uncharged Cter and charged Nter."""
topoaa_module.params["molecules"] = [
Path(GOLDEN_DATA, "e2aP_1F3G.pdb"),
]
topoaa_module.params["mol1"]["charged_nter"] = True
topoaa_module.params["mol1"]["charged_cter"] = False
topoaa_module.params["cns_exec"] = CNS_EXEC
topoaa_module.params["debug"] = True
topoaa_module.run()

expected_inp = Path(topoaa_module.path, "e2aP_1F3G.inp")
expected_psf = Path(topoaa_module.path, "e2aP_1F3G_haddock.psf")
expected_pdb = Path(topoaa_module.path, "e2aP_1F3G_haddock.pdb")
expected_gz = Path(topoaa_module.path, "e2aP_1F3G.out.gz")
assert expected_inp.exists()
assert expected_psf.exists()
assert expected_pdb.exists()
assert expected_gz.exists()

nter, cter = extract_nter_cter(expected_pdb)
assert "OXT" not in cter
assert all([nh in nter for nh in ("HT1", "HT2", "HT3",)])


def test_topoaa_module_protein_noNter(topoaa_module):
"""Topoaa module with charged Cter and uncharged Nter."""
topoaa_module.params["molecules"] = [
Path(GOLDEN_DATA, "e2aP_1F3G.pdb"),
]
topoaa_module.params["mol1"]["charged_nter"] = False
topoaa_module.params["mol1"]["charged_cter"] = True
topoaa_module.params["cns_exec"] = CNS_EXEC
topoaa_module.params["debug"] = True
topoaa_module.run()

expected_inp = Path(topoaa_module.path, "e2aP_1F3G.inp")
expected_psf = Path(topoaa_module.path, "e2aP_1F3G_haddock.psf")
expected_pdb = Path(topoaa_module.path, "e2aP_1F3G_haddock.pdb")
expected_gz = Path(topoaa_module.path, "e2aP_1F3G.out.gz")
assert expected_inp.exists()
assert expected_psf.exists()
assert expected_pdb.exists()
assert expected_gz.exists()

nter, cter = extract_nter_cter(expected_pdb)
assert "OXT" in cter
assert not any([nh in nter for nh in ("HT1", "HT2", "HT3",)])

def test_topoaa_module_protein_noter(topoaa_module):
"""Topoaa module without charged termini."""
topoaa_module.params["molecules"] = [
Path(GOLDEN_DATA, "e2aP_1F3G.pdb"),
]
topoaa_module.params["mol1"]["charged_nter"] = False
topoaa_module.params["mol1"]["charged_cter"] = False
topoaa_module.params["cns_exec"] = CNS_EXEC
topoaa_module.params["debug"] = True
topoaa_module.run()

expected_inp = Path(topoaa_module.path, "e2aP_1F3G.inp")
expected_psf = Path(topoaa_module.path, "e2aP_1F3G_haddock.psf")
expected_pdb = Path(topoaa_module.path, "e2aP_1F3G_haddock.pdb")
expected_gz = Path(topoaa_module.path, "e2aP_1F3G.out.gz")
assert expected_inp.exists()
assert expected_psf.exists()
assert expected_pdb.exists()
assert expected_gz.exists()

nter, cter = extract_nter_cter(expected_pdb)
assert "OXT" not in cter
assert not any([nh in nter for nh in ("HT1", "HT2", "HT3",)])


def test_topoaa_module_protein_charged_ters(topoaa_module):
"""Topoaa module with charged termini."""
topoaa_module.params["molecules"] = [
Path(GOLDEN_DATA, "e2aP_1F3G.pdb"),
]
topoaa_module.params["mol1"]["charged_nter"] = True
topoaa_module.params["mol1"]["charged_cter"] = True
topoaa_module.params["cns_exec"] = CNS_EXEC
topoaa_module.params["debug"] = True
topoaa_module.run()

expected_inp = Path(topoaa_module.path, "e2aP_1F3G.inp")
expected_psf = Path(topoaa_module.path, "e2aP_1F3G_haddock.psf")
expected_pdb = Path(topoaa_module.path, "e2aP_1F3G_haddock.pdb")
expected_gz = Path(topoaa_module.path, "e2aP_1F3G.out.gz")
assert expected_inp.exists()
assert expected_psf.exists()
assert expected_pdb.exists()
assert expected_gz.exists()

nter, cter = extract_nter_cter(expected_pdb)
assert "OXT" in cter
assert all([nh in nter for nh in ("HT1", "HT2", "HT3",)])
11 changes: 11 additions & 0 deletions src/haddock/core/cns_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,17 @@
SCATTER_LIB = "scatter.lib"
INITIAL_POSITIONS_DIR = "initial_positions"

PROTEIN_LINK_FILES = {
"NH3+,COO-": "protein-allhdg5-4.link",
"NH,COO-": "protein-allhdg5-4-noNter.link",
"NH3+,CO": "protein-allhdg5-4-noCter.link",
"NH,CO": LINK_FILE,
}
NUCL_LINK_FILES = {
"5'Phosphate": "dna-rna-pho-1.3.link",
"5'Sugar": "dna-rna-1.3.link",
}

# default prepared paths
link_file = Path(toppar_path, LINK_FILE)
scatter_lib = Path(toppar_path, SCATTER_LIB)
Expand Down
40 changes: 37 additions & 3 deletions src/haddock/libs/libcns.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,36 @@ def generate_default_header(
)


def find_desired_linkfiles(
charged_nter: bool = False,
charged_cter: bool = False,
phosphate_5: bool = False,
path: Optional[FilePath] = None,
) -> dict[str, Path]:
# Set output variable
linkfiles = {}
# Logic to find appropriate link for proteins
if charged_nter and charged_cter:
prot_link_key = "NH3+,COO-"
elif not charged_nter and charged_cter:
prot_link_key = "NH,COO-"
elif charged_nter and not charged_cter:
prot_link_key= "NH3+,CO"
elif not charged_nter and not charged_cter:
prot_link_key = "NH,CO"
# Point to corresponding file
linkfiles["prot_link_infile"] = cns_paths.PROTEIN_LINK_FILES[prot_link_key]

# Logic to find linkfile for dna
nucl_link_key = "5'Phosphate" if phosphate_5 else "5'Sugar"
# Point to corresponding file
linkfiles["nucl_link_infile"] = cns_paths.NUCL_LINK_FILES[nucl_link_key]
# Converts to real paths
if path is not None:
linkfiles = {key: Path(path, p) for key, p in linkfiles.items()}
return linkfiles


def _is_nan(x: Any) -> bool:
"""Inspect if is nan."""
try:
Expand Down Expand Up @@ -116,10 +146,13 @@ def load_workflow_params(

Returns
-------
str
param_header: str
The string with the CNS parameters defined.
"""
non_empty_parameters = ((k, v) for k, v in params.items() if filter_empty_vars(v))
non_empty_parameters = (
(k, v) for k, v in params.items()
if filter_empty_vars(v)
)

# types besides the ones in the if-statements should not enter this loop
for param, v in non_empty_parameters:
Expand Down Expand Up @@ -160,7 +193,8 @@ def write_eval_line(param: Any, value: Any, eval_line: str = "eval (${}={})") ->
def load_link(mol_link: Path) -> str:
"""Add the link header."""
return load_workflow_params(
param_header=f"{linesep}! Link file{linesep}", link_file=mol_link
param_header=f"{linesep}! Link file{linesep}",
prot_link_infile=mol_link,
)


Expand Down
20 changes: 16 additions & 4 deletions src/haddock/modules/topology/topoaa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from haddock.core.typing import FilePath, Optional, ParamDict, ParamMap, Union
from haddock.libs import libpdb
from haddock.libs.libcns import (
find_desired_linkfiles,
generate_default_header,
load_workflow_params,
prepare_output,
Expand Down Expand Up @@ -74,20 +75,20 @@ def generate_topology(
)

input_str = prepare_single_input(str(input_pdb))

# Order CNS statements
inp_parts = (
general_param,
input_str,
output,
link,
#link,
trans_vec,
tensor,
scatter,
axis,
water_box,
recipe_str,
)

# Combine all CNS statments as a single string
inp = "".join(inp_parts)

if write_to_disk:
Expand Down Expand Up @@ -231,7 +232,18 @@ def _run(self) -> None:
# molecule parameters are shared among models of the same molecule
parameters_for_this_molecule = mol_params[mol_params_get()]

for task_id, model in enumerate(splited_models):
# Find appropriate link files for this molecule
link_files = find_desired_linkfiles(
charged_nter=parameters_for_this_molecule.pop("charged_nter"),
charged_cter=parameters_for_this_molecule.pop("charged_cter"),
phosphate_5=False,
path=self.toppar_path,
)
# Update molecule parameters with full path to link files
parameters_for_this_molecule.update(link_files)

# Loop over molecules conformations
for _task_id, model in enumerate(splited_models):
self.log(f"Sanitizing molecule {model.name}")
models_dic[i].append(model)

Expand Down
8 changes: 6 additions & 2 deletions src/haddock/modules/topology/topoaa/cns/generate-topology.cns
Original file line number Diff line number Diff line change
Expand Up @@ -328,13 +328,17 @@ segment
if ( &separate = true ) then
separate=true
end if
!if ( &BLANK%prot_link_infile = false ) then
if ( &BLANK%prot_link_infile = false ) then
@@$prot_link_infile
else
@@&prot_link_infile
!end if
end if
if ( &BLANK%ion_link_infile = false ) then
@@&ion_link_infile
end if
if ( &BLANK%nucl_link_infile = false ) then
@@$nucl_link_infile
else
@@&nucl_link_infile
end if
if ( &BLANK%carbo_link_infile = false ) then
Expand Down
34 changes: 26 additions & 8 deletions src/haddock/modules/topology/topoaa/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,14 @@ disulphide_dist:
group: molecule
explevel: guru
mol1:
type: dict
group: input molecules
explevel: easy
title: Input molecule configuration
short: Specific molecule configuration
long: You can expand this parameter and associated sub-parameters to the other input molecules. For example,
if you input three molecules, you can define mol1, mol2, and mol3
subparameters. Those not defined will be populated with the defaults.
cyclicpept:
default: false
type: boolean
Expand Down Expand Up @@ -171,14 +179,24 @@ mol1:
long: Residue number of the Histidine to be defined as HISE
group: molecule
explevel: expert
type: dict
group: input molecules
explevel: easy
title: Input molecule configuration
short: Specific molecule configuration
long: You can expand this parameter and associated sub-parameters to the other input molecules. For example,
if you input three molecules, you can define mol1, mol2, and mol3
subparameters. Those not defined will be populated with the defaults.
charged_nter:
default: false
type: boolean
title: N-ter topology
short: Defines how N-terminus residue should be
long: This option defines how N-terminus residue should be. If false (default), N-ter will be an uncharged NH.
as it would be in a peptide bond. If true, N-ter will be a charged NH3+.
group: molecule
explevel: easy
charged_cter:
default: false
type: boolean
title: C-ter topology
short: Defines how C-terminus residue should be
long: This option defines how N-terminus residue should be. If false (default), C-ter will be an uncharged CO,
as it would be in a peptide bond. If true, C-ter will be a charged COO-.
group: molecule
explevel: easy
molecules:
default: false
type: boolean
Expand Down