From a3a8a3415b62a02162a68935c8257c524cbdf041 Mon Sep 17 00:00:00 2001 From: Brandon Duane Walker Date: Wed, 28 Feb 2024 14:28:37 -0500 Subject: [PATCH] extract protein and ligand in same workflow step --- .github/workflows/docker_build.yml | 2 +- cwl_adapters/extract_ligand_protein.cwl | 85 ++++++++++++++ cwl_adapters/extract_protein.cwl | 2 +- docker/dockerBuild.sh | 1 + dockerPull.sh | 1 + .../scripts/Dockerfile_extract_ligand_protein | 7 ++ examples/scripts/extract_ligand_protein.py | 111 ++++++++++++++++++ 7 files changed, 207 insertions(+), 2 deletions(-) create mode 100644 cwl_adapters/extract_ligand_protein.cwl create mode 100644 examples/scripts/Dockerfile_extract_ligand_protein create mode 100644 examples/scripts/extract_ligand_protein.py diff --git a/.github/workflows/docker_build.yml b/.github/workflows/docker_build.yml index 5b6f4de..178f886 100644 --- a/.github/workflows/docker_build.yml +++ b/.github/workflows/docker_build.yml @@ -26,7 +26,7 @@ jobs: rename_residues_mol, combine_structure, remove_terminal_residue_name_prefixes, molgan, pdbbind_refined, onionnet-sfct, smina, pdbfixer, - fix_pdb_atom_column, extract_protein, generate_conformers] # No username for pdbind_refined + fix_pdb_atom_column, extract_protein, extract_ligand_protein, generate_conformers] # No username for pdbind_refined # skip data/ and cwl_adapters/file_format_conversions/biosimspace/ runs-on: [ubuntu-latest] diff --git a/cwl_adapters/extract_ligand_protein.cwl b/cwl_adapters/extract_ligand_protein.cwl new file mode 100644 index 0000000..58423fb --- /dev/null +++ b/cwl_adapters/extract_ligand_protein.cwl @@ -0,0 +1,85 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.0 + +class: CommandLineTool + +label: A tool that employs OpenMM to extract ligands and protein from a PDB file + +doc: |- + A tool that employs OpenMM to extract ligands and protein from a PDB file + +baseCommand: ['python', '/extract_ligand_protein.py'] + +hints: + DockerRequirement: + dockerPull: mrbrandonwalker/extract_ligand_protein + +inputs: + input_pdb_path: + label: Input pdb file path + doc: |- + Input pdb file path + Type: string + File type: input + Accepted formats: pdb + Example file: https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/data/utils/cat_protein.pdb + type: File + format: + - edam:format_1476 + inputBinding: + prefix: --input_pdb_path + + output_pdb_path: + label: Output pdb file path + doc: |- + Output pdb file path + Type: string + File type: output + Accepted formats: pdb + Example file: https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/reference/utils/ref_cat_pdb.pdb + type: string + format: + - edam:format_1476 + inputBinding: + prefix: --output_pdb_path + default: system.pdb + + output_pdb_ligand_path: + label: Output pdb ligand file path + doc: |- + Output pdb ligand file path + Type: string + File type: output + Accepted formats: sdf + type: string + format: + - edam:format_1476 + inputBinding: + prefix: --output_pdb_ligand_path + default: ligand_system.pdb + +outputs: + output_pdb_path: + label: Output pdb file path + doc: |- + Output pdb file path + type: File + outputBinding: + glob: $(inputs.output_pdb_path) + format: edam:format_1476 + + output_pdb_ligand_path: + label: Output ligand pdb file path + doc: |- + Output ligand pdb file path + Use optional File? since ligand may not exist in complex + type: File? + outputBinding: + glob: $(inputs.output_pdb_ligand_path) + format: edam:format_1476 + +$namespaces: + edam: https://edamontology.org/ + +$schemas: +- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl diff --git a/cwl_adapters/extract_protein.cwl b/cwl_adapters/extract_protein.cwl index 9103d9c..50f722a 100644 --- a/cwl_adapters/extract_protein.cwl +++ b/cwl_adapters/extract_protein.cwl @@ -58,4 +58,4 @@ $namespaces: edam: https://edamontology.org/ $schemas: -- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl +- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl \ No newline at end of file diff --git a/docker/dockerBuild.sh b/docker/dockerBuild.sh index 4f3411a..ca33b52 100755 --- a/docker/dockerBuild.sh +++ b/docker/dockerBuild.sh @@ -34,6 +34,7 @@ sudo docker build --no-cache --pull -f Dockerfile_onionnet-sfct -t polusai/onion sudo docker build --no-cache --pull -f Dockerfile_smina -t cyangnyu/smina . sudo docker build --no-cache --pull -f Dockerfile_pdbfixer -t ndonyapour/pdbfixer . sudo docker build --no-cache --pull -f Dockerfile_extract_protein -t ndonyapour/extract_protein . +sudo docker build --no-cache --pull -f Dockerfile_extract_ligand_protein -t mrbrandonwalker/extract_ligand_protein . sudo docker build --no-cache --pull -f Dockerfile_fix_pdb_atom_column -t ndonyapour/fix_pdb_atom_column . sudo docker build --no-cache --pull -f Dockerfile_generate_conformers -t ndonyapour/generate_conformers . diff --git a/dockerPull.sh b/dockerPull.sh index f173c7b..3419792 100755 --- a/dockerPull.sh +++ b/dockerPull.sh @@ -28,5 +28,6 @@ docker pull mrbrandonwalker/diffdock_gpu docker pull mrbrandonwalker/diffdock_cpu docker pull ndonyapour/pdbfixer docker pull ndonyapour/extract_protein +docker pull mrbrandonwalker/extract_ligand_protein docker pull ndonyapour/fix_pdb_atom_column docker pull ndonyapour/generate_conformers \ No newline at end of file diff --git a/examples/scripts/Dockerfile_extract_ligand_protein b/examples/scripts/Dockerfile_extract_ligand_protein new file mode 100644 index 0000000..2454a30 --- /dev/null +++ b/examples/scripts/Dockerfile_extract_ligand_protein @@ -0,0 +1,7 @@ +# docker build -f Dockerfile_extract_ligand_protein -t mrbrandonwalker/extract_ligand_protein . +FROM condaforge/mambaforge +# NOT mambaforge-pypy3 (mdanalysis is incompatible with pypy) +RUN mamba install mdanalysis + +ADD extract_ligand_protein.py . +ADD Dockerfile_extract_ligand_protein . \ No newline at end of file diff --git a/examples/scripts/extract_ligand_protein.py b/examples/scripts/extract_ligand_protein.py new file mode 100644 index 0000000..8670736 --- /dev/null +++ b/examples/scripts/extract_ligand_protein.py @@ -0,0 +1,111 @@ +# pylint: disable=no-member +import sys +import os +import argparse + +import MDAnalysis as mda + + +def parse_arguments() -> argparse.Namespace: + """ This function parses the arguments. + + Returns: + argparse.Namespace: The command line arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument('--input_pdb_path', type=str) + parser.add_argument('--output_pdb_path', type=str) + parser.add_argument('--output_pdb_ligand_path', type=str) + args = parser.parse_args() + return args + + +def extract_ligand_protein(input_pdb_path: str, output_pdb_path: str, output_pdb_ligand_path: str) -> None: + """ Extract ligand & protein from the PDB file + + Args: + input_pdb_path (str): The path to the input pdb file + output_pdb_path (str): The path to the output pdb file + output_pdb_ligand_path (str): The path to the output pdb ligand file + """ + + # Load the PDB file + u = mda.Universe(input_pdb_path) + + # Get unique residue names + protein_atoms = u.select_atoms('protein') # use simple atom selection when possible + + # Create a new Universe with only protein atoms + protein_u = mda.Universe.empty(n_atoms=protein_atoms.n_atoms, trajectory=True) # needed for coordinates + protein_u.atoms = protein_atoms + + # duplicate the universe object + dup_u = mda.Universe(input_pdb_path) + + # now do the same for the ligand, not protein and not water or salts + ligand_atoms = u.select_atoms('not protein') + + try: + # guess the bonds, since input PDB may not have bonds + dup_u.atoms.guess_bonds() + except ValueError: + # ValueError: vdw radii for types: AS. These can be defined manually using the keyword 'vdwradii' + print('Error: Could not guess bonds. Check the input PDB file.') + + has_bonds = False + try: + num_bonds = len(dup_u.atoms.bonds) + has_bonds = True + except mda.exceptions.NoDataError: + print('No bonds found in the PDB file.') + + # Identify water molecules based on the connectivity pattern (Oxygen bonded to two Hydrogens) + if has_bonds: + water_indices = set() + for atom in dup_u.atoms: # dont use selection resname == 'HOH', pdb file may have different water residue names + if atom.name == 'O' and len(atom.bonds) == 2: # if hydrogens are added + bonded_atoms_names = set([a.name for a in atom.bonded_atoms]) + if bonded_atoms_names == {'H'}: # Check if both bonds are Hydrogens + water_indices.add(atom.index) + water_indices.update([a.index for a in atom.bonded_atoms]) + + # now want to remove all salts, waters without H + non_bonded = set() + for atom in dup_u.atoms: + if len(atom.bonds) == 0: + non_bonded.add(atom.index) + + # Remove water by excluding the water indices + if len(water_indices) > 0: + water_indices_string = ' '.join([str(i) for i in water_indices]) + ligand_atoms = ligand_atoms.select_atoms(f'not index {water_indices_string}') + + # Remove non bonded atoms + if len(non_bonded) > 0: + non_bonded_string = ' '.join([str(i) for i in non_bonded]) + ligand_atoms = ligand_atoms.select_atoms(f'not index {non_bonded_string}') + + ligand_u = mda.Universe.empty(n_atoms=ligand_atoms.n_atoms, trajectory=True) # needed for coordinates + ligand_u.atoms = ligand_atoms + + with open(output_pdb_path, mode="w", encoding='utf-8') as wfile: + protein_u.atoms.write(output_pdb_path) + if len(ligand_u.atoms) > 0: # will crash if no ligand atoms + with open(output_pdb_ligand_path, mode="w", encoding='utf-8') as wfile: + ligand_u.atoms.write(output_pdb_ligand_path) + + +def main() -> None: + """ Reads the command line arguments and extract protein from the PDB file + """ + args = parse_arguments() + + if not os.path.exists(args.input_pdb_path): + print(f'Error: Can not find file {args.input_pdb_path}') + sys.exit(1) + + extract_ligand_protein(args.input_pdb_path, args.output_pdb_path, args.output_pdb_ligand_path) + + +if __name__ == '__main__': + main()