Commit 58f0e7f6 authored by Nathan Frey's avatar Nathan Frey
Browse files

add gnina backend

parent b42d4f27
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
# flake8: noqa
from deepchem.dock.pose_generation import PoseGenerator
from deepchem.dock.pose_generation import VinaPoseGenerator
from deepchem.dock.pose_generation import GninaPoseGenerator
from deepchem.dock.docking import Docker
from deepchem.dock.binding_pocket import ConvexHullPocketFinder
+169 −7
Original line number Diff line number Diff line
@@ -7,7 +7,7 @@ import os
import tempfile
import tarfile
import numpy as np
from subprocess import call
from subprocess import call, Popen, PIPE
from subprocess import check_output
from typing import List, Optional, Tuple, Union

@@ -16,7 +16,7 @@ from deepchem.utils.data_utils import download_url, get_data_dir
from deepchem.utils.typing import RDKitMol
from deepchem.utils.geometry_utils import compute_centroid, compute_protein_range
from deepchem.utils.rdkit_utils import load_molecule, write_molecule
from deepchem.utils.vina_utils import load_docked_ligands, write_vina_conf
from deepchem.utils.vina_utils import load_docked_ligands, write_vina_conf, write_gnina_conf, read_gnina_log

logger = logging.getLogger(__name__)
DOCKED_POSES = List[Tuple[RDKitMol, RDKitMol]]
@@ -53,8 +53,8 @@ class PoseGenerator(object):
    centroid: np.ndarray, optional (default None)
      The centroid to dock against. Is computed if not specified.
    box_dims: np.ndarray, optional (default None)
      A numpy array of shape `(3,)` holding the size of the box to dock. If not
      specified is set to size of molecular complex plus 5 angstroms.
      A numpy array of shape `(3,)` holding the size of the box to dock.
      If not specified is set to size of molecular complex plus 5 angstroms.
    exhaustiveness: int, optional (default 10)
      Tells pose generator how exhaustive it should be with pose
      generation.
@@ -79,6 +79,167 @@ class PoseGenerator(object):
    raise NotImplementedError


class GninaPoseGenerator(PoseGenerator):
  """Use GNINA to generate binding poses.

  This class uses GNINA (a deep learning framework for molecular
  docking) to generate binding poses. It downloads the GNINA
  executable to DEEPCHEM_DATA_DIR (an environment variable you set)
  and invokes the executable to perform pose generation.

  GNINA uses pre-trained convolutional neural network (CNN) scoring
  functions to rank binding poses based on learned representations of
  3D protein-ligand interactions. It has been shown to outperform
  AutoDock Vina in virtual screening applications [1]_.

  If you use the GNINA molecular docking engine, please cite the relevant
  papers: https://github.com/gnina/gnina#citation
  The primary citation for GNINA is [1]_.

  References
  ----------
  .. [1] M Ragoza, J Hochuli, E Idrobo, J Sunseri, DR Koes.
  "Protein–Ligand Scoring with Convolutional Neural Networks."
  Journal of chemical information and modeling (2017).

  Notes
  -----
  * GNINA requires CUDA >= 10.1 for fast CNN scoring.
  * Almost all dependencies are included in the most compatible way
    possible, which reduces performance. Build GNINA from source
    for production use.

  """

  def __init__(self):
    """Initialize GNINA pose generator."""

    data_dir = get_data_dir()
    if platform.system() == 'Linux':
      url = "https://github.com/gnina/gnina/releases/download/v1.0/gnina"
      filename = 'gnina'
      self.gnina_dir = data_dir
      self.gnina_cmd = os.path.join(self.gnina_dir, filename)
    else:
      raise ValueError(
          "Unknown operating system. Try using a cloud platform to run this code instead."
      )

    if not os.path.exists(self.gnina_cmd):
      logger.info("GNINA not available. Downloading...")
      download_url(url, data_dir)
      downloaded_file = os.path.join(data_dir, filename)
      os.chmod(downloaded_file, 755)
      logger.info("Downloaded GNINA.")

  def generate_poses(
      self,
      molecular_complex: Tuple[str, str],
      centroid: Optional[np.ndarray] = None,
      box_dims: Optional[np.ndarray] = None,
      exhaustiveness: int = 10,
      num_modes: int = 9,
      num_pockets: Optional[int] = None,
      out_dir: Optional[str] = None,
      generate_scores: bool = True,
      **kwargs) -> Union[Tuple[DOCKED_POSES, List[float]], DOCKED_POSES]:
    """Generates the docked complex and outputs files for docked complex.

    Parameters
    ----------
    molecular_complexes: Tuple[str, str]
      A representation of a molecular complex. This tuple is
      (protein_file, ligand_file).
    centroid: np.ndarray, optional (default None)
      The centroid to dock against. Is computed if not specified.
    box_dims: np.ndarray, optional (default None)
      A numpy array of shape `(3,)` holding the size of the box to dock.
      If not specified is set to size of molecular complex plus 4 angstroms.
    exhaustiveness: int (default 8)
      Tells GNINA how exhaustive it should be with pose
      generation.
    num_modes: int (default 9)
      Tells GNINA how many binding modes it should generate at
      each invocation.
    out_dir: str, optional
      If specified, write generated poses to this directory.
    generate_scores: bool, optional (default True)
      If `True`, the pose generator will return scores for complexes.
      This is used typically when invoking external docking programs
      that compute scores.
    kwargs:
      Any args supported by GNINA as documented
      https://github.com/gnina/gnina#usage

    Returns
    -------
    Tuple[`docked_poses`, `scores`] or `docked_poses`
      Tuple of `(docked_poses, scores)` or `docked_poses`. `docked_poses`
      is a list of docked molecular complexes. Each entry in this list
      contains a `(protein_mol, ligand_mol)` pair of RDKit molecules.
      `scores` is an array of binding affinities (kcal/mol),
      CNN pose scores, and CNN affinities predicted by GNINA.

    """

    if out_dir is None:
      out_dir = tempfile.mkdtemp()
    if not os.path.exists(out_dir):
      os.makedirs(out_dir)

    # Parse complex
    if len(molecular_complex) > 2:
      raise ValueError(
          "GNINA can only dock protein-ligand complexes and not more general molecular complexes."
      )

    (protein_file, ligand_file) = molecular_complex

    # check filetypes
    if not protein_file.endswith('.pdb'):
      raise ValueError('Protein file must be in .pdb format.')
    if not ligand_file.endswith('.sdf'):
      raise ValueError('Ligand file must be in .sdf format.')

    protein_mol = load_molecule(
        protein_file, calc_charges=True, add_hydrogens=True)
    ligand_name = os.path.basename(ligand_file).split(".")[0]

    # Define locations of log and output files
    log_file = os.path.join(out_dir, "%s_log.txt" % ligand_name)
    out_file = os.path.join(out_dir, "%s_docked.pdbqt" % ligand_name)
    logger.info("About to call GNINA.")

    # Write GNINA conf file
    conf_file = os.path.join(out_dir, "conf.txt")
    write_gnina_conf(
        protein_filename=protein_file,
        ligand_filename=ligand_file,
        conf_filename=conf_file,
        num_modes=num_modes,
        exhaustiveness=exhaustiveness,
        **kwargs)

    # Run GNINA
    args = [
        self.gnina_cmd, "--config", conf_file, "--log", log_file, "--out",
        out_file
    ]
    process = Popen(args, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate()
    stdout_decoded = stdout.decode()

    # read output and log
    ligands, _ = load_docked_ligands(out_file)
    docked_complexes = [(protein_mol[1], ligand) for ligand in ligands]
    scores = read_gnina_log(log_file)

    if generate_scores:
      return docked_complexes, scores
    else:
      return docked_complexes


class VinaPoseGenerator(PoseGenerator):
  """Uses Autodock Vina to generate binding poses.

@@ -157,7 +318,7 @@ class VinaPoseGenerator(PoseGenerator):
                     num_modes: int = 9,
                     num_pockets: Optional[int] = None,
                     out_dir: Optional[str] = None,
                     generate_scores: bool = False
                     generate_scores: Optional[bool] = False
                    ) -> Union[Tuple[DOCKED_POSES, List[float]], DOCKED_POSES]:
    """Generates the docked complex and outputs files for docked complex.

@@ -168,7 +329,8 @@ class VinaPoseGenerator(PoseGenerator):
    ----------
    molecular_complexes: Tuple[str, str]
      A representation of a molecular complex. This tuple is
      (protein_file, ligand_file).
      (protein_file, ligand_file). The protein should be a pdb file
      and the ligand should be an sdf file.
    centroid: np.ndarray, optional
      The centroid to dock against. Is computed if not specified.
    box_dims: np.ndarray, optional
@@ -263,7 +425,7 @@ class VinaPoseGenerator(PoseGenerator):
      centroids = centroids[:num_pockets]
      dimensions = dimensions[:num_pockets]

    # Prepare protein
    # Prepare ligand
    ligand_name = os.path.basename(ligand_file).split(".")[0]
    ligand_pdbqt = os.path.join(out_dir, "%s.pdbqt" % ligand_name)

+33 −0
Original line number Diff line number Diff line
@@ -23,6 +23,11 @@ class TestPoseGeneration(unittest.TestCase):
    """Test that VinaPoseGenerator can be initialized."""
    dc.dock.VinaPoseGenerator()

  @unittest.skipIf(IS_WINDOWS, 'Skip the test on Windows')
  def test_gnina_initialization(self):
    """Test that GninaPoseGenerator can be initialized."""
    dc.dock.GninaPoseGenerator()

  @unittest.skipIf(IS_WINDOWS, 'Skip the test on Windows')
  def test_pocket_vina_initialization(self):
    """Test that VinaPoseGenerator can be initialized."""
@@ -58,6 +63,34 @@ class TestPoseGeneration(unittest.TestCase):
    assert isinstance(protein, Chem.Mol)
    assert isinstance(ligand, Chem.Mol)

  @pytest.mark.slow
  def test_gnina_poses_and_scores(self):
    """Test that GninaPoseGenerator generates poses and scores

    This test takes some time to run, about 3 minutes on
    development laptop.
    """
    # Let's turn on logging since this test will run for a while
    logging.basicConfig(level=logging.INFO)
    current_dir = os.path.dirname(os.path.realpath(__file__))
    protein_file = os.path.join(current_dir, "1jld_protein.pdb")
    ligand_file = os.path.join(current_dir, "1jld_ligand.sdf")

    gpg = dc.dock.GninaPoseGenerator()
    with tempfile.TemporaryDirectory() as tmp:
      poses, scores = gpg.generate_poses(
          (protein_file, ligand_file),
          exhaustiveness=1,
          num_modes=1,
          out_dir=tmp)

    assert len(poses) == 1
    assert len(scores) == 1
    protein, ligand = poses[0]
    from rdkit import Chem
    assert isinstance(protein, Chem.Mol)
    assert isinstance(ligand, Chem.Mol)

  @pytest.mark.slow
  def test_vina_poses_no_scores(self):
    """Test that VinaPoseGenerator generates poses.
+2 −0
Original line number Diff line number Diff line
@@ -84,6 +84,8 @@ from deepchem.utils.pdbqt_utils import convert_protein_to_pdbqt
from deepchem.utils.pdbqt_utils import convert_mol_to_pdbqt

from deepchem.utils.vina_utils import write_vina_conf
from deepchem.utils.vina_utils import write_gnina_conf
from deepchem.utils.vina_utils import read_gnina_log
from deepchem.utils.vina_utils import load_docked_ligands
from deepchem.utils.vina_utils import prepare_inputs

+25 −0
Original line number Diff line number Diff line
              _             
             (_)            
   __ _ _ __  _ _ __   __ _ 
  / _` | '_ \| | '_ \ / _` |
 | (_| | | | | | | | | (_| |
  \__, |_| |_|_|_| |_|\__,_|
   __/ |                    
  |___/                     

gnina  master:b4a640e   Built Jan 28 2021.
gnina is based on smina and AutoDock Vina.
Please cite appropriately.

WARNING: No GPU detected. CNN scoring will be slow.
Recommend running with single model (--cnn crossdock_default2018)
or without cnn scoring (--cnn_scoring=none).

Commandline: /tmp/gnina --autobox_ligand 10gs_rec.pdb --config conf.txt --out docked.pdbqt --log log.txt
Using random seed: 2128721554

mode |  affinity  |    CNN     |   CNN
     | (kcal/mol) | pose score | affinity
-----+------------+------------+----------
    1       -4.37       0.6392      4.336
    2       -3.56       0.6202      4.162
Loading