From a3a8a3415b62a02162a68935c8257c524cbdf041 Mon Sep 17 00:00:00 2001
From: Brandon Duane Walker <walkerbd@UPDATEME.HOSTNAME.COM>
Date: Wed, 28 Feb 2024 14:28:37 -0500
Subject: [PATCH] extract protein and ligand in same workflow step

---
 .github/workflows/docker_build.yml            |   2 +-
 cwl_adapters/extract_ligand_protein.cwl       |  85 ++++++++++++++
 cwl_adapters/extract_protein.cwl              |   2 +-
 docker/dockerBuild.sh                         |   1 +
 dockerPull.sh                                 |   1 +
 .../scripts/Dockerfile_extract_ligand_protein |   7 ++
 examples/scripts/extract_ligand_protein.py    | 111 ++++++++++++++++++
 7 files changed, 207 insertions(+), 2 deletions(-)
 create mode 100644 cwl_adapters/extract_ligand_protein.cwl
 create mode 100644 examples/scripts/Dockerfile_extract_ligand_protein
 create mode 100644 examples/scripts/extract_ligand_protein.py

diff --git a/.github/workflows/docker_build.yml b/.github/workflows/docker_build.yml
index 5b6f4de..178f886 100644
--- a/.github/workflows/docker_build.yml
+++ b/.github/workflows/docker_build.yml
@@ -26,7 +26,7 @@ jobs:
                      rename_residues_mol, combine_structure,
                      remove_terminal_residue_name_prefixes, molgan,
                      pdbbind_refined, onionnet-sfct, smina, pdbfixer,
-                     fix_pdb_atom_column, extract_protein, generate_conformers]  # No username for pdbind_refined
+                     fix_pdb_atom_column, extract_protein, extract_ligand_protein, generate_conformers]  # No username for pdbind_refined
         # skip data/ and cwl_adapters/file_format_conversions/biosimspace/
     runs-on: [ubuntu-latest]
 
diff --git a/cwl_adapters/extract_ligand_protein.cwl b/cwl_adapters/extract_ligand_protein.cwl
new file mode 100644
index 0000000..58423fb
--- /dev/null
+++ b/cwl_adapters/extract_ligand_protein.cwl
@@ -0,0 +1,85 @@
+#!/usr/bin/env cwl-runner
+cwlVersion: v1.0
+
+class: CommandLineTool
+
+label: A tool that employs OpenMM to extract ligands and protein from a PDB file
+
+doc: |-
+  A tool that employs OpenMM to extract ligands and protein from a PDB file
+
+baseCommand: ['python', '/extract_ligand_protein.py']
+
+hints:
+  DockerRequirement:
+    dockerPull: mrbrandonwalker/extract_ligand_protein
+
+inputs:
+  input_pdb_path:
+    label: Input pdb file path
+    doc: |-
+      Input pdb file path
+      Type: string
+      File type: input
+      Accepted formats: pdb
+      Example file: https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/data/utils/cat_protein.pdb
+    type: File
+    format:
+    - edam:format_1476
+    inputBinding:
+      prefix: --input_pdb_path
+
+  output_pdb_path:
+    label: Output pdb file path
+    doc: |-
+      Output pdb file path
+      Type: string
+      File type: output
+      Accepted formats: pdb
+      Example file: https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/reference/utils/ref_cat_pdb.pdb
+    type: string
+    format:
+    - edam:format_1476
+    inputBinding:
+      prefix: --output_pdb_path
+    default: system.pdb
+
+  output_pdb_ligand_path:
+    label: Output pdb ligand file path
+    doc: |-
+      Output pdb ligand file path
+      Type: string
+      File type: output
+      Accepted formats: sdf
+    type: string
+    format:
+    - edam:format_1476
+    inputBinding:
+      prefix: --output_pdb_ligand_path
+    default: ligand_system.pdb
+
+outputs:
+  output_pdb_path:
+    label: Output pdb file path
+    doc: |-
+      Output pdb file path
+    type: File
+    outputBinding:
+      glob: $(inputs.output_pdb_path)
+    format: edam:format_1476
+
+  output_pdb_ligand_path:
+    label: Output ligand pdb file path
+    doc: |-
+      Output ligand pdb file path
+      Use optional File? since ligand may not exist in complex
+    type: File?
+    outputBinding:
+      glob: $(inputs.output_pdb_ligand_path)
+    format: edam:format_1476
+
+$namespaces:
+  edam: https://edamontology.org/
+
+$schemas:
+- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl
diff --git a/cwl_adapters/extract_protein.cwl b/cwl_adapters/extract_protein.cwl
index 9103d9c..50f722a 100644
--- a/cwl_adapters/extract_protein.cwl
+++ b/cwl_adapters/extract_protein.cwl
@@ -58,4 +58,4 @@ $namespaces:
   edam: https://edamontology.org/
 
 $schemas:
-- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl
+- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl
\ No newline at end of file
diff --git a/docker/dockerBuild.sh b/docker/dockerBuild.sh
index 4f3411a..ca33b52 100755
--- a/docker/dockerBuild.sh
+++ b/docker/dockerBuild.sh
@@ -34,6 +34,7 @@ sudo docker build --no-cache --pull -f Dockerfile_onionnet-sfct -t polusai/onion
 sudo docker build --no-cache --pull -f Dockerfile_smina -t cyangnyu/smina .
 sudo docker build --no-cache --pull -f Dockerfile_pdbfixer -t ndonyapour/pdbfixer .
 sudo docker build --no-cache --pull -f Dockerfile_extract_protein -t ndonyapour/extract_protein .
+sudo docker build --no-cache --pull -f Dockerfile_extract_ligand_protein -t mrbrandonwalker/extract_ligand_protein .
 sudo docker build --no-cache --pull -f Dockerfile_fix_pdb_atom_column -t ndonyapour/fix_pdb_atom_column .
 sudo docker build --no-cache --pull -f Dockerfile_generate_conformers -t ndonyapour/generate_conformers .
 
diff --git a/dockerPull.sh b/dockerPull.sh
index f173c7b..3419792 100755
--- a/dockerPull.sh
+++ b/dockerPull.sh
@@ -28,5 +28,6 @@ docker pull mrbrandonwalker/diffdock_gpu
 docker pull mrbrandonwalker/diffdock_cpu
 docker pull ndonyapour/pdbfixer
 docker pull ndonyapour/extract_protein
+docker pull mrbrandonwalker/extract_ligand_protein
 docker pull ndonyapour/fix_pdb_atom_column
 docker pull ndonyapour/generate_conformers
\ No newline at end of file
diff --git a/examples/scripts/Dockerfile_extract_ligand_protein b/examples/scripts/Dockerfile_extract_ligand_protein
new file mode 100644
index 0000000..2454a30
--- /dev/null
+++ b/examples/scripts/Dockerfile_extract_ligand_protein
@@ -0,0 +1,7 @@
+# docker build -f Dockerfile_extract_ligand_protein -t mrbrandonwalker/extract_ligand_protein .
+FROM condaforge/mambaforge
+# NOT mambaforge-pypy3 (mdanalysis is incompatible with pypy)
+RUN mamba install mdanalysis
+
+ADD extract_ligand_protein.py .
+ADD Dockerfile_extract_ligand_protein .
\ No newline at end of file
diff --git a/examples/scripts/extract_ligand_protein.py b/examples/scripts/extract_ligand_protein.py
new file mode 100644
index 0000000..8670736
--- /dev/null
+++ b/examples/scripts/extract_ligand_protein.py
@@ -0,0 +1,111 @@
+# pylint: disable=no-member
+import sys
+import os
+import argparse
+
+import MDAnalysis as mda
+
+
+def parse_arguments() -> argparse.Namespace:
+    """ This function parses the arguments.
+
+    Returns:
+        argparse.Namespace: The command line arguments
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_pdb_path', type=str)
+    parser.add_argument('--output_pdb_path', type=str)
+    parser.add_argument('--output_pdb_ligand_path', type=str)
+    args = parser.parse_args()
+    return args
+
+
+def extract_ligand_protein(input_pdb_path: str, output_pdb_path: str, output_pdb_ligand_path: str) -> None:
+    """ Extract ligand & protein from the PDB file
+
+    Args:
+        input_pdb_path (str): The path to the input pdb file
+        output_pdb_path (str): The path to the output pdb file
+        output_pdb_ligand_path (str): The path to the output pdb ligand file
+    """
+
+    # Load the PDB file
+    u = mda.Universe(input_pdb_path)
+
+    # Get unique residue names
+    protein_atoms = u.select_atoms('protein')  # use simple atom selection when possible
+
+    # Create a new Universe with only protein atoms
+    protein_u = mda.Universe.empty(n_atoms=protein_atoms.n_atoms, trajectory=True)  # needed for coordinates
+    protein_u.atoms = protein_atoms
+
+    # duplicate the universe object
+    dup_u = mda.Universe(input_pdb_path)
+
+    # now do the same for the ligand, not protein and not water or salts
+    ligand_atoms = u.select_atoms('not protein')
+
+    try:
+        # guess the bonds, since input PDB may not have bonds
+        dup_u.atoms.guess_bonds()
+    except ValueError:
+        # ValueError: vdw radii for types: AS. These can be defined manually using the keyword 'vdwradii'
+        print('Error: Could not guess bonds. Check the input PDB file.')
+
+    has_bonds = False
+    try:
+        num_bonds = len(dup_u.atoms.bonds)
+        has_bonds = True
+    except mda.exceptions.NoDataError:
+        print('No bonds found in the PDB file.')
+
+    # Identify water molecules based on the connectivity pattern (Oxygen bonded to two Hydrogens)
+    if has_bonds:
+        water_indices = set()
+        for atom in dup_u.atoms:  # dont use selection resname == 'HOH', pdb file may have different water residue names
+            if atom.name == 'O' and len(atom.bonds) == 2:  # if hydrogens are added
+                bonded_atoms_names = set([a.name for a in atom.bonded_atoms])
+                if bonded_atoms_names == {'H'}:  # Check if both bonds are Hydrogens
+                    water_indices.add(atom.index)
+                    water_indices.update([a.index for a in atom.bonded_atoms])
+
+        # now want to remove all salts, waters without H
+        non_bonded = set()
+        for atom in dup_u.atoms:
+            if len(atom.bonds) == 0:
+                non_bonded.add(atom.index)
+
+        # Remove water by excluding the water indices
+        if len(water_indices) > 0:
+            water_indices_string = ' '.join([str(i) for i in water_indices])
+            ligand_atoms = ligand_atoms.select_atoms(f'not index {water_indices_string}')
+
+        # Remove non bonded atoms
+        if len(non_bonded) > 0:
+            non_bonded_string = ' '.join([str(i) for i in non_bonded])
+            ligand_atoms = ligand_atoms.select_atoms(f'not index {non_bonded_string}')
+
+    ligand_u = mda.Universe.empty(n_atoms=ligand_atoms.n_atoms, trajectory=True)  # needed for coordinates
+    ligand_u.atoms = ligand_atoms
+
+    with open(output_pdb_path, mode="w", encoding='utf-8') as wfile:
+        protein_u.atoms.write(output_pdb_path)
+    if len(ligand_u.atoms) > 0:  # will crash if no ligand atoms
+        with open(output_pdb_ligand_path, mode="w", encoding='utf-8') as wfile:
+            ligand_u.atoms.write(output_pdb_ligand_path)
+
+
+def main() -> None:
+    """ Reads the command line arguments and extract protein from the PDB file
+    """
+    args = parse_arguments()
+
+    if not os.path.exists(args.input_pdb_path):
+        print(f'Error: Can not find file {args.input_pdb_path}')
+        sys.exit(1)
+
+    extract_ligand_protein(args.input_pdb_path, args.output_pdb_path, args.output_pdb_ligand_path)
+
+
+if __name__ == '__main__':
+    main()