Commit 81a00f86 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #335 from rbharath/pdbbind_fast

Making PDBBind Examples Easier to work with
parents 0e086e62 e9e11d4d
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -14,3 +14,4 @@ import deepchem.nn
import deepchem.splits
import deepchem.trans
import deepchem.utils
import deepchem.dock
+14 −0
Original line number Diff line number Diff line
"""
Imports all submodules 
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

from deepchem.dock.pose_generation import PoseGenerator
from deepchem.dock.pose_generation import VinaPoseGenerator
from deepchem.dock.pose_scoring import PoseScorer
from deepchem.dock.pose_scoring import GridPoseScorer
from deepchem.dock.docking import Docker
from deepchem.dock.docking import VinaGridRFDocker
from deepchem.dock.docking import VinaGridDNNDocker
+84 −0
Original line number Diff line number Diff line
"""
Docks protein-ligand pairs 
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"

import numpy as np
import os
import tempfile
from deepchem.feat import GridFeaturizer
from deepchem.data import DiskDataset
from deepchem.models import SklearnModel
from deepchem.models import TensorflowMultiTaskRegressor
from deepchem.dock.pose_scoring import GridPoseScorer
from deepchem.dock.pose_generation import VinaPoseGenerator
from sklearn.ensemble import RandomForestRegressor
from subprocess import call

class Docker(object):
  """Abstract Class specifying API for Docking."""

  def dock(self, protein_file, ligand_file):
    raise NotImplementedError

class VinaGridRFDocker(Docker):
  """Vina pose-generation, RF-models on grid-featurization of complexes."""

  def __init__(self):
    """Builds model."""
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
    call(("wget http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/random_full_RF.tar.gz").split())
    call(("tar -zxvf random_full_RF.tar.gz").split())
    call(("mv random_full_RF %s" % (self.base_dir)).split())
    self.model_dir = os.path.join(self.base_dir, "random_full_RF")

    # Fit model on dataset
    model = SklearnModel(model_dir=self.model_dir)
    model.reload()

    self.pose_scorer = GridPoseScorer(model, feat="grid")
    self.pose_generator = VinaPoseGenerator() 

  def dock(self, protein_file, ligand_file):
    """Docks using Vina and RF."""
    protein_docked, ligand_docked = self.pose_generator.generate_poses(
        protein_file, ligand_file)
    score = self.pose_scorer.score(protein_docked, ligand_docked)
    return (score, (protein_docked, ligand_docked))

class VinaGridDNNDocker(object):
  """Vina pose-generation, DNN-models on grid-featurization of complexes."""

  def __init__(self, n_trees=100):
    """Builds model."""
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
    call(("wget http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/random_full_DNN.tar.gz").split())
    call(("tar -zxvf random_full_DNN.tar.gz").split())
    call(("mv random_full_DNN %s" % (self.base_dir)).split())
    self.model_dir = os.path.join(self.base_dir, "random_full_DNN")

    # Fit model on dataset
    pdbbind_tasks = ["-logKd/Ki"]
    n_features = 2052
    model = TensorflowMultiTaskRegressor(
        len(pdbbind_tasks), n_features, logdir=self.model_dir, dropouts=[.25],
        learning_rate=0.0003, weight_init_stddevs=[.1], batch_size=64)
    model.reload()

    self.pose_scorer = GridPoseScorer(model, feat="grid")
    self.pose_generator = VinaPoseGenerator() 

  def dock(self, protein_file, ligand_file):
    """Docks using Vina and DNNs."""
    protein_docked, ligand_docked = self.pose_generator.generate_poses(
        protein_file, ligand_file)
    score = self.pose_scorer.score(protein_docked, ligand_docked)
    return (score, (protein_docked, ligand_docked))
+129 −0
Original line number Diff line number Diff line
"""
Generates protein-ligand docked poses using Autodock Vina.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"

import numpy as np
import os
import pybel
import tempfile
from deepchem.feat import hydrogenate_and_compute_partial_charges
from subprocess import call

class PoseGenerator(object):
  """Abstract superclass for all pose-generation routines."""

  def generate_poses(self, protein_file, ligand_file, out_dir=None):
    """Generates the docked complex and outputs files for docked complex."""
    raise NotImplementedError

def write_conf(receptor_filename, ligand_filename, centroid, box_dims,
               conf_filename, exhaustiveness=None):
  """Writes Vina configuration file to disk."""
  with open(conf_filename, "w") as f:
    f.write("receptor = %s\n" % receptor_filename)
    f.write("ligand = %s\n\n" % ligand_filename)

    f.write("center_x = %f\n" % centroid[0])
    f.write("center_y = %f\n" % centroid[1])
    f.write("center_z = %f\n\n" % centroid[2])

    f.write("size_x = %f\n" % box_dims[0])
    f.write("size_y = %f\n" % box_dims[1])
    f.write("size_z = %f\n\n" % box_dims[2])

    if exhaustiveness is not None:
      f.write("exhaustiveness = %d\n" % exhaustiveness)

def get_molecule_data(pybel_molecule):
  """Uses pybel to compute centroid and range of molecule (Angstroms)."""
  atom_positions = []
  for atom in pybel_molecule:
    atom_positions.append(atom.coords)
  num_atoms = len(atom_positions)
  protein_xyz = np.asarray(atom_positions)
  protein_centroid = np.mean(protein_xyz, axis=0)
  protein_max = np.max(protein_xyz, axis=0)
  protein_min = np.min(protein_xyz, axis=0)
  protein_range = protein_max - protein_min
  return protein_centroid, protein_range


class VinaPoseGenerator(PoseGenerator):

  def __init__(self, exhaustiveness=1):
    """Initializes Vina Pose generation"""
    current_dir = os.path.dirname(os.path.realpath(__file__))
    self.vina_dir = os.path.join(current_dir, "autodock_vina_1_1_2_linux_x86")
    self.exhaustiveness = exhaustiveness
    if not os.path.exists(self.vina_dir):
      print("Vina not available. Downloading")
      # TODO(rbharath): May want to move this file to S3 so we can ensure it's
      # always available.
      wget_cmd = "wget http://vina.scripps.edu/download/autodock_vina_1_1_2_linux_x86.tgz"
      call(wget_cmd.split())
      print("Downloaded Vina. Extracting")
      download_cmd = "tar xzvf autodock_vina_1_1_2_linux_x86.tgz"
      call(download_cmd.split())
      print("Moving to final location")
      mv_cmd = "mv autodock_vina_1_1_2_linux_x86 %s" % current_dir
      call(mv_cmd.split())
      print("Cleanup: removing downloaded vina tar.gz")
      rm_cmd = "rm autodock_vina_1_1_2_linux_x86.tgz"
      call(rm_cmd.split())
    self.vina_cmd = os.path.join(self.vina_dir, "bin/vina")
      

  def generate_poses(self, protein_file, ligand_file, out_dir=None):
    """Generates the docked complex and outputs files for docked complex."""
    if out_dir is None:
      out_dir = tempfile.mkdtemp()

    # Prepare receptor 
    receptor_name = os.path.basename(protein_file).split(".")[0]
    protein_hyd = os.path.join(out_dir, "%s.pdb" % receptor_name)
    protein_pdbqt = os.path.join(out_dir, "%s.pdbqt" % receptor_name)
    hydrogenate_and_compute_partial_charges(protein_file, "pdb",
                                            hyd_output=protein_hyd,
                                            pdbqt_output=protein_pdbqt,
                                            protein=True)
    # Get protein centroid and range
    receptor_pybel = next(pybel.readfile(str("pdb"), str(protein_hyd)))
    # TODO(rbharath): Need to add some way to identify binding pocket, or this is
    # going to be extremely slow!
    protein_centroid, protein_range = get_molecule_data(receptor_pybel)
    box_dims = protein_range + 5.0

    # Prepare receptor
    ligand_name = os.path.basename(ligand_file).split(".")[0]
    ligand_hyd = os.path.join(out_dir, "%s.pdb" % ligand_name)
    ligand_pdbqt = os.path.join(out_dir, "%s.pdbqt" % ligand_name)

    # TODO(rbharath): Generalize this so can support mol2 files as well.
    hydrogenate_and_compute_partial_charges(ligand_file, "sdf",
                                            hyd_output=ligand_hyd,
                                            pdbqt_output=ligand_pdbqt,
                                            protein=False)

    # Write Vina conf file
    conf_file = os.path.join(out_dir, "conf.txt")
    write_conf(protein_pdbqt, ligand_pdbqt, protein_centroid,
               box_dims, conf_file, exhaustiveness=self.exhaustiveness)

    # Define locations of log and output files
    log_file = os.path.join(out_dir, "%s_log.txt" % ligand_name)
    out_pdbqt = os.path.join(out_dir, "%s_docked.pdbqt" % ligand_name)
    # TODO(rbharath): Let user specify the number of poses required.
    print("About to call Vina")
    call("%s --config %s --log %s --out %s"
         % (self.vina_cmd, conf_file, log_file, out_pdbqt), shell=True)
    # TODO(rbharath): Convert the output pdbqt to a pdb file.

    # Return docked files 
    return protein_hyd, out_pdbqt
+49 −0
Original line number Diff line number Diff line
"""
Scores protein-ligand poses using DeepChem.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"

import numpy as np
import os
import tempfile
from deepchem.feat import GridFeaturizer
from deepchem.data import NumpyDataset
from subprocess import call

class PoseScorer(object):
  """Abstract superclass for all scoring methods."""

  def score(self, protein_file, ligand_file):
    """Returns a score for a protein/ligand pair."""
    raise NotImplementedError

class GridPoseScorer(object):

  def __init__(self, model, feat="grid"):
    """Initializes a pose-scorer."""
    self.model = model
    if feat == "grid":
      self.featurizer = GridFeaturizer(
          voxel_width=16.0, feature_types="voxel_combined",
          # TODO(rbharath, enf): Figure out why pi_stack is slow and cation_pi
          # causes segfaults.
          #voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
          #"salt_bridge"], ecfp_power=9, splif_power=9,
          voxel_feature_types=["ecfp", "splif", "hbond", "salt_bridge"],
          ecfp_power=9, splif_power=9,
          parallel=True, flatten=True)
    else:
      raise ValueError("feat not defined.")

  def score(self, protein_file, ligand_file):
    """Returns a score for a protein/ligand pair."""
    features = self.featurizer.featurize_complexes([ligand_file], [protein_file])
    dataset = NumpyDataset(X=features, y=None, w=None, ids=None)
    score = self.model.predict(dataset)
    return score
Loading