extract protein and ligand in same workflow step

Brandon Duane Walker · Brandon Duane Walker · commit a1de187bbe1b · 2024-03-06T10:07:41.000-05:00
diff --git a/.github/workflows/docker_build.yml b/.github/workflows/docker_build.yml
@@ -26,7 +26,7 @@ jobs:
                      rename_residues_mol, combine_structure,
                      remove_terminal_residue_name_prefixes, molgan,
                      pdbbind_refined, onionnet-sfct, smina, pdbfixer,
-                     fix_pdb_atom_column, extract_protein, generate_conformers]  # No username for pdbind_refined
+                     fix_pdb_atom_column, extract_ligand_protein, generate_conformers]  # No username for pdbbind_refined
         # skip data/ and cwl_adapters/file_format_conversions/biosimspace/
     runs-on: [ubuntu-latest]
 
diff --git a/cwl_adapters/extract_ligand_protein.cwl b/cwl_adapters/extract_ligand_protein.cwl
@@ -3,16 +3,16 @@ cwlVersion: v1.0
 
 class: CommandLineTool
 
-label: A tool that employs OpenMM to extract protein from a PDB file
+label: A tool that employs MDAnalysis to extract ligands and protein from a PDB file
 
 doc: |-
-  A tool that employs OpenMM to extract protein from a PDB file
+  A tool that employs MDAnalysis to extract ligands and protein from a PDB file
 
-baseCommand: ['python', '/extract_protein.py']
+baseCommand: ['python', '/extract_ligand_protein.py']
 
 hints:
   DockerRequirement:
-    dockerPull: ndonyapour/extract_protein
+    dockerPull: mrbrandonwalker/extract_ligand_protein
 
 inputs:
   input_pdb_path:
@@ -44,6 +44,20 @@ inputs:
       prefix: --output_pdb_path
     default: system.pdb
 
+  output_pdb_ligand_path:
+    label: Output pdb ligand file path
+    doc: |-
+      Output pdb ligand file path
+      Type: string
+      File type: output
+      Accepted formats: pdb
+    type: string
+    format:
+    - edam:format_1476
+    inputBinding:
+      prefix: --output_pdb_ligand_path
+    default: ligand_system.pdb
+
 outputs:
   output_pdb_path:
     label: Output pdb file path
@@ -54,6 +68,16 @@ outputs:
       glob: $(inputs.output_pdb_path)
     format: edam:format_1476
 
+  output_pdb_ligand_path:
+    label: Output ligand pdb file path
+    doc: |-
+      Output ligand pdb file path
+      Use optional File? since ligand may not exist in complex
+    type: File?
+    outputBinding:
+      glob: $(inputs.output_pdb_ligand_path)
+    format: edam:format_1476
+
 $namespaces:
   edam: https://edamontology.org/
 
diff --git a/docker/dockerBuild.sh b/docker/dockerBuild.sh
@@ -33,7 +33,7 @@ sudo docker build --no-cache --pull -f Dockerfile_molgan -t ndonyapour/molgan .
 sudo docker build --no-cache --pull -f Dockerfile_onionnet-sfct -t cyangnyu/onionnet-sfct .
 sudo docker build --no-cache --pull -f Dockerfile_smina -t cyangnyu/smina .
 sudo docker build --no-cache --pull -f Dockerfile_pdbfixer -t ndonyapour/pdbfixer .
-sudo docker build --no-cache --pull -f Dockerfile_extract_protein -t ndonyapour/extract_protein .
+sudo docker build --no-cache --pull -f Dockerfile_extract_ligand_protein -t mrbrandonwalker/extract_ligand_protein .
 sudo docker build --no-cache --pull -f Dockerfile_fix_pdb_atom_column -t ndonyapour/fix_pdb_atom_column .
 sudo docker build --no-cache --pull -f Dockerfile_generate_conformers -t ndonyapour/generate_conformers .
 
diff --git a/dockerPull.sh b/dockerPull.sh
@@ -27,6 +27,6 @@ docker pull ndonyapour/combine_structure
 docker pull mrbrandonwalker/diffdock_gpu
 docker pull mrbrandonwalker/diffdock_cpu
 docker pull ndonyapour/pdbfixer
-docker pull ndonyapour/extract_protein
+docker pull mrbrandonwalker/extract_ligand_protein
 docker pull ndonyapour/fix_pdb_atom_column
 docker pull ndonyapour/generate_conformers
diff --git a/examples/docking/vs_demo_4.yml b/examples/docking/vs_demo_4.yml
@@ -14,7 +14,7 @@ steps:
       output_sdf_paths: '&pdbbind_sdfs'
       experimental_dGs: '&exp_dGs'
       pdb_ids: '&pdbids'
-      
+
 - config_tag_pdb:
     scatter: [pdb_id]
     in:
@@ -39,11 +39,12 @@ steps:
     scatter: [input_pdb_path, input_helper_pdb_path]
     scatterMethod: dotproduct
 
-- extract_protein:
+- extract_ligand_protein:
     scatter: [input_pdb_path]
     in:
       input_pdb_path: '*pdbbind_pdbfixer.pdb'
       output_pdb_path: '&protein.pdb'
+      output_pdb_ligand_path: '&ligand.pdb'
 
 # assign partial charges (ligand)
 - convert_pdbqt.yml:
@@ -160,7 +161,7 @@ wic:
         inlineable: False
         graphviz:
           label: Fix Protein Structure
-    (5, extract_protein):
+    (5, extract_ligand_protein):
       wic:
         inlineable: False
         graphviz:
diff --git a/examples/scripts/Dockerfile_extract_ligand_protein b/examples/scripts/Dockerfile_extract_ligand_protein
@@ -0,0 +1,7 @@
+FROM condaforge/mambaforge
+# NOT mambaforge-pypy3 (mdanalysis is incompatible with pypy)
+RUN mamba config --add channels conda-forge
+# -y: docker build has no terminal to answer the confirmation prompt
+RUN mamba install -y mdanalysis
+COPY extract_ligand_protein.py .
+COPY Dockerfile_extract_ligand_protein .
diff --git a/examples/scripts/Dockerfile_extract_protein b/examples/scripts/Dockerfile_extract_protein
diff --git a/examples/scripts/extract_ligand_protein.py b/examples/scripts/extract_ligand_protein.py
@@ -0,0 +1,99 @@
+# pylint: disable=no-member
+import sys
+import os
+import argparse
+
+import MDAnalysis as mda
+
+
+def parse_arguments() -> argparse.Namespace:
+    """ This function parses the arguments.
+
+    Returns:
+        argparse.Namespace: input_pdb_path, output_pdb_path and output_pdb_ligand_path
+    """
+    parser = argparse.ArgumentParser(description='Extract ligand & protein from a PDB file')
+    # Nothing can be done without an input, so let argparse report its absence
+    parser.add_argument('--input_pdb_path', type=str, required=True)
+    parser.add_argument('--output_pdb_path', type=str, default='system.pdb')
+    parser.add_argument('--output_pdb_ligand_path', type=str, default='ligand_system.pdb')
+    return parser.parse_args()
+
+
+def extract_ligand_protein(input_pdb_path: str, output_pdb_path: str, output_pdb_ligand_path: str) -> None:
+    """ Extract ligand & protein from the PDB file
+
+    The protein is whatever the MDAnalysis 'protein' selection matches. The
+    ligand is every other atom that is neither part of a water molecule nor
+    completely unbonded (salts, ions, waters without hydrogens). The ligand
+    file is only written when at least one such atom exists.
+
+    Args:
+        input_pdb_path (str): The path to the input pdb file
+        output_pdb_path (str): The path to the output pdb file
+        output_pdb_ligand_path (str): The path to the output pdb ligand file
+    """
+
+    # Load the PDB file; atoms are written from this universe
+    u = mda.Universe(input_pdb_path)
+    protein_atoms = u.select_atoms('protein')  # use simple atom selection when possible
+
+    # Guess the bonds on a second copy, since input PDB may not have bonds.
+    # The copy keeps the guessed bonds out of the universe that is written;
+    # atom indices are the same in both because they come from the same file.
+    dup_u = mda.Universe(input_pdb_path)
+    dup_u.atoms.guess_bonds()
+    excluded = _water_and_unbonded_indices(dup_u.atoms)
+
+    # now do the same for the ligand, not protein and not water or salts
+    ligand_indices = [int(i) for i in u.select_atoms('not protein').indices if int(i) not in excluded]
+
+    # AtomGroup.write creates the file itself, so nothing is opened beforehand
+    protein_atoms.write(output_pdb_path)
+    if ligand_indices:  # will crash if no ligand atoms
+        u.atoms[ligand_indices].write(output_pdb_ligand_path)
+
+
+def _water_and_unbonded_indices(atoms: 'mda.AtomGroup') -> set:
+    """ Collect the indices of water atoms and of atoms without any bond
+
+    Waters are recognised by connectivity (an oxygen bonded to exactly two
+    hydrogens) instead of by residue name, since pdb files may use different
+    water residue names.
+
+    Args:
+        atoms (mda.AtomGroup): Atoms whose bonds are known (e.g. after guess_bonds)
+
+    Returns:
+        set: The atom indices that must not end up in the ligand file
+    """
+
+    excluded = set()
+    for atom in atoms:
+        n_bonds = len(atom.bonds)
+        if n_bonds == 0:
+            # salts, ions and waters without H
+            excluded.add(int(atom.index))
+        elif n_bonds == 2 and atom.name == 'O':  # if hydrogens are added
+            partners = atom.bonded_atoms
+            # Match hydrogens by prefix: they may be named H, H1/H2, HW1/HW2, ...
+            if all(h.name.upper().startswith('H') for h in partners):
+                excluded.add(int(atom.index))
+                excluded.update(int(h.index) for h in partners)
+    return excluded
+
+
+def main() -> None:
+    """ Reads the command line arguments and extracts the protein and the ligand
+    from the PDB file. Exits with status 1 when the input file cannot be found.
+    """
+    args = parse_arguments()
+    if args.input_pdb_path is None or not os.path.exists(args.input_pdb_path):
+        # sys.exit(message) prints the message to stderr and exits with status 1
+        sys.exit(f'Error: Can not find file {args.input_pdb_path}')
+
+    extract_ligand_protein(args.input_pdb_path, args.output_pdb_path, args.output_pdb_ligand_path)
+
+
+if __name__ == '__main__':  # only run when executed as a script, not on import
+    main()
diff --git a/examples/scripts/extract_protein.py b/examples/scripts/extract_protein.py