Merge pull request #2380 from ncfrey/gnina-support (6956e1d0) · Commits · 钟慕尧 / deepchem

deepchem/dock/init.py

+1 −0

Original line number	Diff line number	Diff line
		# flake8: noqa
		from deepchem.dock.pose_generation import PoseGenerator
		from deepchem.dock.pose_generation import VinaPoseGenerator
		from deepchem.dock.pose_generation import GninaPoseGenerator
		from deepchem.dock.docking import Docker
		from deepchem.dock.binding_pocket import ConvexHullPocketFinder

deepchem/dock/pose_generation.py

+169 −7

Original line number	Diff line number	Diff line
		@@ -7,7 +7,7 @@ import os
		import tempfile
		import tarfile
		import numpy as np
		from subprocess import call
		from subprocess import call, Popen, PIPE
		from subprocess import check_output
		from typing import List, Optional, Tuple, Union

		@@ -16,7 +16,7 @@ from deepchem.utils.data_utils import download_url, get_data_dir
		from deepchem.utils.typing import RDKitMol
		from deepchem.utils.geometry_utils import compute_centroid, compute_protein_range
		from deepchem.utils.rdkit_utils import load_molecule, write_molecule
		from deepchem.utils.vina_utils import load_docked_ligands, write_vina_conf
		from deepchem.utils.docking_utils import load_docked_ligands, write_vina_conf, write_gnina_conf, read_gnina_log

		logger = logging.getLogger(__name__)
		DOCKED_POSES = List[Tuple[RDKitMol, RDKitMol]]
		@@ -53,8 +53,8 @@ class PoseGenerator(object):
		centroid: np.ndarray, optional (default None)
		The centroid to dock against. Is computed if not specified.
		box_dims: np.ndarray, optional (default None)
		A numpy array of shape `(3,)` holding the size of the box to dock. If not
		specified is set to size of molecular complex plus 5 angstroms.
		A numpy array of shape `(3,)` holding the size of the box to dock.
		If not specified is set to size of molecular complex plus 5 angstroms.
		exhaustiveness: int, optional (default 10)
		Tells pose generator how exhaustive it should be with pose
		generation.
		@@ -79,6 +79,167 @@ class PoseGenerator(object):
		raise NotImplementedError


		class GninaPoseGenerator(PoseGenerator):
		"""Use GNINA to generate binding poses.

		This class uses GNINA (a deep learning framework for molecular
		docking) to generate binding poses. It downloads the GNINA
		executable to DEEPCHEM_DATA_DIR (an environment variable you set)
		and invokes the executable to perform pose generation.

		GNINA uses pre-trained convolutional neural network (CNN) scoring
		functions to rank binding poses based on learned representations of
		3D protein-ligand interactions. It has been shown to outperform
		AutoDock Vina in virtual screening applications [1]_.

		If you use the GNINA molecular docking engine, please cite the relevant
		papers: https://github.com/gnina/gnina#citation
		The primary citation for GNINA is [1]_.

		References
		----------
		.. [1] M Ragoza, J Hochuli, E Idrobo, J Sunseri, DR Koes.
		"Protein–Ligand Scoring with Convolutional Neural Networks."
		Journal of chemical information and modeling (2017).

		Note
		----
		* GNINA currently only works on Linux operating systems.
		* GNINA requires CUDA >= 10.1 for fast CNN scoring.
		* Almost all dependencies are included in the most compatible way
		possible, which reduces performance. Build GNINA from source
		for production use.

		"""

		def __init__(self):
		"""Initialize GNINA pose generator."""

		data_dir = get_data_dir()
		if platform.system() == 'Linux':
		url = "https://github.com/gnina/gnina/releases/download/v1.0/gnina"
		filename = 'gnina'
		self.gnina_dir = data_dir
		self.gnina_cmd = os.path.join(self.gnina_dir, filename)
		else:
		raise ValueError(
		"GNINA currently only runs on Linux. Try using a cloud platform to run this code instead."
		)

		if not os.path.exists(self.gnina_cmd):
		logger.info("GNINA not available. Downloading...")
		download_url(url, data_dir)
		downloaded_file = os.path.join(data_dir, filename)
		os.chmod(downloaded_file, 755)
		logger.info("Downloaded GNINA.")

		def generate_poses(
		self,
		molecular_complex: Tuple[str, str],
		centroid: Optional[np.ndarray] = None,
		box_dims: Optional[np.ndarray] = None,
		exhaustiveness: int = 10,
		num_modes: int = 9,
		num_pockets: Optional[int] = None,
		out_dir: Optional[str] = None,
		generate_scores: bool = True,
		**kwargs) -> Union[Tuple[DOCKED_POSES, List[float]], DOCKED_POSES]:
		"""Generates the docked complex and outputs files for docked complex.

		Parameters
		----------
		molecular_complexes: Tuple[str, str]
		A representation of a molecular complex. This tuple is
		(protein_file, ligand_file).
		centroid: np.ndarray, optional (default None)
		The centroid to dock against. Is computed if not specified.
		box_dims: np.ndarray, optional (default None)
		A numpy array of shape `(3,)` holding the size of the box to dock.
		If not specified is set to size of molecular complex plus 4 angstroms.
		exhaustiveness: int (default 8)
		Tells GNINA how exhaustive it should be with pose
		generation.
		num_modes: int (default 9)
		Tells GNINA how many binding modes it should generate at
		each invocation.
		out_dir: str, optional
		If specified, write generated poses to this directory.
		generate_scores: bool, optional (default True)
		If `True`, the pose generator will return scores for complexes.
		This is used typically when invoking external docking programs
		that compute scores.
		kwargs:
		Any args supported by GNINA as documented
		https://github.com/gnina/gnina#usage

		Returns
		-------
		Tuple[`docked_poses`, `scores`] or `docked_poses`
		Tuple of `(docked_poses, scores)` or `docked_poses`. `docked_poses`
		is a list of docked molecular complexes. Each entry in this list
		contains a `(protein_mol, ligand_mol)` pair of RDKit molecules.
		`scores` is an array of binding affinities (kcal/mol),
		CNN pose scores, and CNN affinities predicted by GNINA.

		"""

		if out_dir is None:
		out_dir = tempfile.mkdtemp()
		if not os.path.exists(out_dir):
		os.makedirs(out_dir)

		# Parse complex
		if len(molecular_complex) > 2:
		raise ValueError(
		"GNINA can only dock protein-ligand complexes and not more general molecular complexes."
		)

		(protein_file, ligand_file) = molecular_complex

		# check filetypes
		if not protein_file.endswith('.pdb'):
		raise ValueError('Protein file must be in .pdb format.')
		if not ligand_file.endswith('.sdf'):
		raise ValueError('Ligand file must be in .sdf format.')

		protein_mol = load_molecule(
		protein_file, calc_charges=True, add_hydrogens=True)
		ligand_name = os.path.basename(ligand_file).split(".")[0]

		# Define locations of log and output files
		log_file = os.path.join(out_dir, "%s_log.txt" % ligand_name)
		out_file = os.path.join(out_dir, "%s_docked.pdbqt" % ligand_name)
		logger.info("About to call GNINA.")

		# Write GNINA conf file
		conf_file = os.path.join(out_dir, "conf.txt")
		write_gnina_conf(
		protein_filename=protein_file,
		ligand_filename=ligand_file,
		conf_filename=conf_file,
		num_modes=num_modes,
		exhaustiveness=exhaustiveness,
		**kwargs)

		# Run GNINA
		args = [
		self.gnina_cmd, "--config", conf_file, "--log", log_file, "--out",
		out_file
		]
		process = Popen(args, stdout=PIPE, stderr=PIPE)
		stdout, stderr = process.communicate()

		# read output and log
		ligands, _ = load_docked_ligands(out_file)
		docked_complexes = [(protein_mol[1], ligand) for ligand in ligands]
		scores = read_gnina_log(log_file)

		if generate_scores:
		return docked_complexes, scores
		else:
		return docked_complexes


		class VinaPoseGenerator(PoseGenerator):
		"""Uses Autodock Vina to generate binding poses.

		@@ -157,7 +318,7 @@ class VinaPoseGenerator(PoseGenerator):
		num_modes: int = 9,
		num_pockets: Optional[int] = None,
		out_dir: Optional[str] = None,
		generate_scores: bool = False
		generate_scores: Optional[bool] = False
		) -> Union[Tuple[DOCKED_POSES, List[float]], DOCKED_POSES]:
		"""Generates the docked complex and outputs files for docked complex.

		@@ -168,7 +329,8 @@ class VinaPoseGenerator(PoseGenerator):
		----------
		molecular_complexes: Tuple[str, str]
		A representation of a molecular complex. This tuple is
		(protein_file, ligand_file).
		(protein_file, ligand_file). The protein should be a pdb file
		and the ligand should be an sdf file.
		centroid: np.ndarray, optional
		The centroid to dock against. Is computed if not specified.
		box_dims: np.ndarray, optional
		@@ -263,7 +425,7 @@ class VinaPoseGenerator(PoseGenerator):
		centroids = centroids[:num_pockets]
		dimensions = dimensions[:num_pockets]

		# Prepare protein
		# Prepare ligand
		ligand_name = os.path.basename(ligand_file).split(".")[0]
		ligand_pdbqt = os.path.join(out_dir, "%s.pdbqt" % ligand_name)

deepchem/dock/tests/test_pose_generation.py

+35 −0

Original line number	Diff line number	Diff line
		@@ -11,6 +11,7 @@ import deepchem as dc
		import pytest

		IS_WINDOWS = platform.system() == 'Windows'
		IS_LINUX = platform.system() == 'Linux'


		class TestPoseGeneration(unittest.TestCase):
		@@ -23,6 +24,11 @@ class TestPoseGeneration(unittest.TestCase):
		"""Test that VinaPoseGenerator can be initialized."""
		dc.dock.VinaPoseGenerator()

		@unittest.skipIf(not IS_LINUX, 'Skip the test on Windows and Mac.')
		def test_gnina_initialization(self):
		"""Test that GninaPoseGenerator can be initialized."""
		dc.dock.GninaPoseGenerator()

		@unittest.skipIf(IS_WINDOWS, 'Skip the test on Windows')
		def test_pocket_vina_initialization(self):
		"""Test that VinaPoseGenerator can be initialized."""
		@@ -58,6 +64,35 @@ class TestPoseGeneration(unittest.TestCase):
		assert isinstance(protein, Chem.Mol)
		assert isinstance(ligand, Chem.Mol)

		@pytest.mark.slow
		@unittest.skipIf(not IS_LINUX, 'Skip the test on Windows and Mac.')
		def test_gnina_poses_and_scores(self):
		"""Test that GninaPoseGenerator generates poses and scores

		This test takes some time to run, about 3 minutes on
		development laptop.
		"""
		# Let's turn on logging since this test will run for a while
		logging.basicConfig(level=logging.INFO)
		current_dir = os.path.dirname(os.path.realpath(__file__))
		protein_file = os.path.join(current_dir, "1jld_protein.pdb")
		ligand_file = os.path.join(current_dir, "1jld_ligand.sdf")

		gpg = dc.dock.GninaPoseGenerator()
		with tempfile.TemporaryDirectory() as tmp:
		poses, scores = gpg.generate_poses(
		(protein_file, ligand_file),
		exhaustiveness=1,
		num_modes=1,
		out_dir=tmp)

		assert len(poses) == 1
		assert len(scores) == 1
		protein, ligand = poses[0]
		from rdkit import Chem
		assert isinstance(protein, Chem.Mol)
		assert isinstance(ligand, Chem.Mol)

		@pytest.mark.slow
		def test_vina_poses_no_scores(self):
		"""Test that VinaPoseGenerator generates poses.

deepchem/utils/init.py

+5 −3

Original line number	Diff line number	Diff line
		@@ -83,9 +83,11 @@ from deepchem.utils.pdbqt_utils import pdbqt_to_pdb
		from deepchem.utils.pdbqt_utils import convert_protein_to_pdbqt
		from deepchem.utils.pdbqt_utils import convert_mol_to_pdbqt

		from deepchem.utils.vina_utils import write_vina_conf
		from deepchem.utils.vina_utils import load_docked_ligands
		from deepchem.utils.vina_utils import prepare_inputs
		from deepchem.utils.docking_utils import write_vina_conf
		from deepchem.utils.docking_utils import write_gnina_conf
		from deepchem.utils.docking_utils import read_gnina_log
		from deepchem.utils.docking_utils import load_docked_ligands
		from deepchem.utils.docking_utils import prepare_inputs

		from deepchem.utils.voxel_utils import convert_atom_to_voxel
		from deepchem.utils.voxel_utils import convert_atom_pair_to_voxel

deepchem/utils/docking_utils.py

0 → 100644

+308 −0

Original line number	Diff line number	Diff line
		"""
		This file contains utilities for molecular docking.
		"""
		from typing import List, Optional, Tuple

		import os
		import numpy as np
		from deepchem.utils.typing import RDKitMol
		from deepchem.utils.pdbqt_utils import pdbqt_to_pdb


		def write_vina_conf(protein_filename: str,
		ligand_filename: str,
		centroid: np.ndarray,
		box_dims: np.ndarray,
		conf_filename: str,
		num_modes: int = 9,
		exhaustiveness: int = None) -> None:
		"""Writes Vina configuration file to disk.

		Autodock Vina accepts a configuration file which provides options
		under which Vina is invoked. This utility function writes a vina
		configuration file which directs Autodock vina to perform docking
		under the provided options.

		Parameters
		----------
		protein_filename: str
		Filename for protein
		ligand_filename: str
		Filename for the ligand
		centroid: np.ndarray
		A numpy array with shape `(3,)` holding centroid of system
		box_dims: np.ndarray
		A numpy array of shape `(3,)` holding the size of the box to dock
		conf_filename: str
		Filename to write Autodock Vina configuration to.
		num_modes: int, optional (default 9)
		The number of binding modes Autodock Vina should find
		exhaustiveness: int, optional
		The exhaustiveness of the search to be performed by Vina
		"""
		with open(conf_filename, "w") as f:
		f.write("receptor = %s\n" % protein_filename)
		f.write("ligand = %s\n\n" % ligand_filename)

		f.write("center_x = %f\n" % centroid[0])
		f.write("center_y = %f\n" % centroid[1])
		f.write("center_z = %f\n\n" % centroid[2])

		f.write("size_x = %f\n" % box_dims[0])
		f.write("size_y = %f\n" % box_dims[1])
		f.write("size_z = %f\n\n" % box_dims[2])

		f.write("num_modes = %d\n\n" % num_modes)
		if exhaustiveness is not None:
		f.write("exhaustiveness = %d\n" % exhaustiveness)


		def write_gnina_conf(protein_filename: str,
		ligand_filename: str,
		conf_filename: str,
		num_modes: int = 9,
		exhaustiveness: int = None,
		**kwargs) -> None:
		"""Writes GNINA configuration file to disk.

		GNINA accepts a configuration file which provides options
		under which GNINA is invoked. This utility function writes a
		configuration file which directs GNINA to perform docking
		under the provided options.

		Parameters
		----------
		protein_filename: str
		Filename for protein
		ligand_filename: str
		Filename for the ligand
		conf_filename: str
		Filename to write Autodock Vina configuration to.
		num_modes: int, optional (default 9)
		The number of binding modes GNINA should find
		exhaustiveness: int, optional
		The exhaustiveness of the search to be performed by GNINA
		kwargs:
		Args supported by GNINA documented here
		https://github.com/gnina/gnina#usage

		"""

		with open(conf_filename, "w") as f:
		f.write("receptor = %s\n" % protein_filename)
		f.write("ligand = %s\n\n" % ligand_filename)

		f.write("autobox_ligand = %s\n\n" % protein_filename)

		if exhaustiveness is not None:
		f.write("exhaustiveness = %d\n" % exhaustiveness)
		f.write("num_modes = %d\n\n" % num_modes)

		for k, v in kwargs.items():
		f.write("%s = %s\n" % (str(k), str(v)))


		def read_gnina_log(log_file: str) -> np.array:
		"""Read GNINA logfile and get docking scores.

		GNINA writes computed binding affinities to a logfile.

		Parameters
		----------
		log_file: str
		Filename of logfile generated by GNINA.

		Returns
		-------
		scores: np.array, dimension (num_modes, 3)
		Array of binding affinity (kcal/mol), CNN pose score,
		and CNN affinity for each binding mode.

		"""

		scores = []
		lines = open(log_file).readlines()
		mode_start = np.inf
		for idx, line in enumerate(lines):
		if line[:6] == '-----+':
		mode_start = idx
		if idx > mode_start:
		mode = line.split()
		score = [float(x) for x in mode[1:]]
		scores.append(score)

		scores = np.array(scores)
		return scores


		def load_docked_ligands(
		pdbqt_output: str) -> Tuple[List[RDKitMol], List[float]]:
		"""This function loads ligands docked by autodock vina.

		Autodock vina writes outputs to disk in a PDBQT file format. This
		PDBQT file can contain multiple docked "poses". Recall that a pose
		is an energetically favorable 3D conformation of a molecule. This
		utility function reads and loads the structures for multiple poses
		from vina's output file.

		Parameters
		----------
		pdbqt_output: str
		Should be the filename of a file generated by autodock vina's
		docking software.

		Returns
		-------
		Tuple[List[rdkit.Chem.rdchem.Mol], List[float]]
		Tuple of `molecules, scores`. `molecules` is a list of rdkit
		molecules with 3D information. `scores` is the associated vina
		score.

		Notes
		-----
		This function requires RDKit to be installed.
		"""
		try:
		from rdkit import Chem
		except ModuleNotFoundError:
		raise ImportError("This function requires RDKit to be installed.")

		lines = open(pdbqt_output).readlines()
		molecule_pdbqts = []
		scores = []
		current_pdbqt: Optional[List[str]] = None
		for line in lines:
		if line[:5] == "MODEL":
		current_pdbqt = []
		elif line[:19] == "REMARK VINA RESULT:":
		words = line.split()
		# the line has format
		# REMARK VINA RESULT: score ...
		# There is only 1 such line per model so we can append it
		scores.append(float(words[3]))
		elif line[:6] == "ENDMDL":
		molecule_pdbqts.append(current_pdbqt)
		current_pdbqt = None
		else:
		# FIXME: Item "None" of "Optional[List[str]]" has no attribute "append"
		current_pdbqt.append(line) # type: ignore

		molecules = []
		for pdbqt_data in molecule_pdbqts:
		pdb_block = pdbqt_to_pdb(pdbqt_data=pdbqt_data)
		mol = Chem.MolFromPDBBlock(str(pdb_block), sanitize=False, removeHs=False)
		molecules.append(mol)
		return molecules, scores


		def prepare_inputs(protein: str,
		ligand: str,
		replace_nonstandard_residues: bool = True,
		remove_heterogens: bool = True,
		remove_water: bool = True,
		add_hydrogens: bool = True,
		pH: float = 7.0,
		optimize_ligand: bool = True,
		pdb_name: Optional[str] = None) -> Tuple[RDKitMol, RDKitMol]:
		"""This prepares protein-ligand complexes for docking.

		Autodock Vina requires PDB files for proteins and ligands with
		sensible inputs. This function uses PDBFixer and RDKit to ensure
		that inputs are reasonable and ready for docking. Default values
		are given for convenience, but fixing PDB files is complicated and
		human judgement is required to produce protein structures suitable
		for docking. Always inspect the results carefully before trying to
		perform docking.

		Parameters
		----------
		protein: str
		Filename for protein PDB file or a PDBID.
		ligand: str
		Either a filename for a ligand PDB file or a SMILES string.
		replace_nonstandard_residues: bool (default True)
		Replace nonstandard residues with standard residues.
		remove_heterogens: bool (default True)
		Removes residues that are not standard amino acids or nucleotides.
		remove_water: bool (default True)
		Remove water molecules.
		add_hydrogens: bool (default True)
		Add missing hydrogens at the protonation state given by `pH`.
		pH: float (default 7.0)
		Most common form of each residue at given `pH` value is used.
		optimize_ligand: bool (default True)
		If True, optimize ligand with RDKit. Required for SMILES inputs.
		pdb_name: Optional[str]
		If given, write sanitized protein and ligand to files called
		"pdb_name.pdb" and "ligand_pdb_name.pdb"

		Returns
		-------
		Tuple[RDKitMol, RDKitMol]
		Tuple of `protein_molecule, ligand_molecule` with 3D information.

		Note
		----
		This function requires RDKit and OpenMM to be installed.
		Read more about PDBFixer here: https://github.com/openmm/pdbfixer.

		Examples
		--------
		>>> p, m = prepare_inputs('3cyx', 'CCC')
		>>> p.GetNumAtoms()
		1415
		>>> m.GetNumAtoms()
		11

		>>> p, m = prepare_inputs('3cyx', 'CCC', remove_heterogens=False)
		>>> p.GetNumAtoms()
		1720

		"""

		try:
		from rdkit import Chem
		from rdkit.Chem import AllChem
		from pdbfixer import PDBFixer
		from simtk.openmm.app import PDBFile
		except ModuleNotFoundError:
		raise ImportError(
		"This function requires RDKit and OpenMM to be installed.")

		if protein.endswith('.pdb'):
		fixer = PDBFixer(protein)
		else:
		fixer = PDBFixer(url='https://files.rcsb.org/download/%s.pdb' % (protein))

		if ligand.endswith('.pdb'):
		m = Chem.MolFromPDBFile(ligand)
		else:
		m = Chem.MolFromSmiles(ligand, sanitize=True)

		# Apply common fixes to PDB files
		if replace_nonstandard_residues:
		fixer.findMissingResidues()
		fixer.findNonstandardResidues()
		fixer.replaceNonstandardResidues()
		if remove_heterogens and not remove_water:
		fixer.removeHeterogens(True)
		if remove_heterogens and remove_water:
		fixer.removeHeterogens(False)
		if add_hydrogens:
		fixer.addMissingHydrogens(pH)

		PDBFile.writeFile(fixer.topology, fixer.positions, open('tmp.pdb', 'w'))
		p = Chem.MolFromPDBFile('tmp.pdb', sanitize=True)
		os.remove('tmp.pdb')

		# Optimize ligand
		if optimize_ligand:
		m = Chem.AddHs(m) # need hydrogens for optimization
		AllChem.EmbedMolecule(m)
		AllChem.MMFFOptimizeMolecule(m)

		if pdb_name:
		Chem.rdmolfiles.MolToPDBFile(p, '%s.pdb' % (pdb_name))
		Chem.rdmolfiles.MolToPDBFile(m, 'ligand_%s.pdb' % (pdb_name))

		return (p, m)

Admin message