Commit 1a05c220 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #340 from rbharath/binding_pocket_feat

Binding Pocket Featurizer
parents 14d4ffa6 fe33e352
Loading
Loading
Loading
Loading
+9 −4
Original line number Diff line number Diff line
@@ -225,13 +225,18 @@ class Dataset(object):
class NumpyDataset(Dataset):
  """A Dataset defined by in-memory numpy arrays."""

  def __init__(self, X, y, w=None, ids=None):
  def __init__(self, X, y=None, w=None, ids=None):
    n_samples = len(X)
    # The -1 indicates that y will be reshaped to have length -1
    if n_samples > 0:
      if y is not None:
        y = np.reshape(y, (n_samples, -1))
        if w is not None:
          w = np.reshape(w, (n_samples, -1))
      else:
        # Set labels to be zero, with zero weights
        y = np.zeros((n_samples, 1))
        w = np.zeros_like(y)
    n_tasks = y.shape[1]
    if ids is None:
      ids = np.arange(n_samples)
+1 −0
Original line number Diff line number Diff line
@@ -13,3 +13,4 @@ from deepchem.dock.docking import Docker
from deepchem.dock.docking import VinaGridRFDocker
from deepchem.dock.docking import VinaGridDNNDocker
from deepchem.dock.binding_pocket import ConvexHullPocketFinder
from deepchem.dock.binding_pocket import RFConvexHullPocketFinder
+108 −11
Original line number Diff line number Diff line
@@ -9,20 +9,25 @@ __author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2017, Stanford University"
__license__ = "GPL"

import numpy as np
import os
import pybel
import tempfile
import numpy as np
import openbabel as ob
from rdkit import Chem
from subprocess import call
from scipy.spatial import ConvexHull
from deepchem.feat import hydrogenate_and_compute_partial_charges
from deepchem.feat.atomic_coordinates import AtomicCoordinates
from deepchem.feat.grid_featurizer import load_molecule
from subprocess import call
from deepchem.feat.binding_pocket_features import BindingPocketFeaturizer 
from deepchem.feat.fingerprints import CircularFingerprint 
from deepchem.models.sklearn_models import SklearnModel
from deepchem.data.datasets import NumpyDataset

def extract_active_site(protein_file, ligand_file, cutoff=4):
  """Extracts a box for the active site."""
  protein_coords = load_molecule(protein_file)[0]
  ligand_coords = load_molecule(ligand_file)[0]
  protein_coords = load_molecule(protein_file, add_hydrogens=False)[0]
  ligand_coords = load_molecule(ligand_file, add_hydrogens=False)[0]
  num_ligand_atoms = len(ligand_coords)
  num_protein_atoms = len(protein_coords)
  pocket_inds = []
@@ -164,7 +169,7 @@ def merge_overlapping_boxes(mapping, boxes, threshold=.8):
class BindingPocketFinder(object):
  """Abstract superclass for binding pocket detectors"""

  def find_pockets(self, protein_file):
  def find_pockets(self, protein_file, ligand_file):
    """Finds potential binding pockets in proteins."""
    raise NotImplementedError

@@ -179,14 +184,106 @@ class ConvexHullPocketFinder(BindingPocketFinder):
  def find_all_pockets(self, protein_file):
    """Find list of binding pockets on protein."""
    # protein_coords is (N, 3) tensor
    coords = load_molecule(protein_file)[0]
    coords = load_molecule(protein_file, add_hydrogens=False)[0]
    return get_all_boxes(coords, self.pad)

  def find_pockets(self, protein_file, ligand_file):
    """Find list of suitable binding pockets on protein."""
    protein_coords = load_molecule(protein_file)[0]
    ligand_coords = load_molecule(ligand_file)[0]
    protein_coords = load_molecule(protein_file, add_hydrogens=False)[0]
    ligand_coords = load_molecule(ligand_file, add_hydrogens=False)[0]
    boxes = get_all_boxes(protein_coords, self.pad)
    mapping = boxes_to_atoms(protein_coords, boxes)
    merged_boxes, mapping = merge_overlapping_boxes(mapping, boxes)
    return merged_boxes, mapping
    pockets, pocket_atoms_map = merge_overlapping_boxes(mapping, boxes)
    pocket_coords = []
    for pocket in pockets:
      atoms = pocket_atoms_map[pocket]
      coords = np.zeros((len(atoms), 3))
      for ind, atom in enumerate(atoms):
        coords[ind] = protein_coords[atom]
      pocket_coords.append(coords)
    return pockets, pocket_atoms_map, pocket_coords

class RFConvexHullPocketFinder(BindingPocketFinder):
  """Uses pre-trained RF model + ConvexHulPocketFinder to select pockets."""

  def __init__(self, pad=5):
    self.pad = pad
    self.convex_finder = ConvexHullPocketFinder(pad)

    # Load binding pocket model
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
    # TODO(rbharath): Shift refined to full once trained.
    call(("wget -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz").split())
    call(("tar -zxvf pocket_random_refined_RF.tar.gz").split())
    call(("mv pocket_random_refined_RF %s" % (self.base_dir)).split())
    self.model_dir = os.path.join(self.base_dir, "pocket_random_refined_RF")

    # Fit model on dataset
    self.model = SklearnModel(model_dir=self.model_dir)
    self.model.reload()

    # Create featurizers
    self.pocket_featurizer = BindingPocketFeaturizer()
    self.ligand_featurizer = CircularFingerprint(size=1024)

  def find_pockets(self, protein_file, ligand_file):
    """Compute features for a given complex

    TODO(rbharath): This has a log of code overlap with
    compute_binding_pocket_features in
    examples/binding_pockets/binding_pocket_datasets.py. Find way to refactor
    to avoid code duplication.
    """
    if not ligand_file.endswith(".sdf"):
      raise ValueError("Only .sdf ligand files can be featurized.")
    ligand_basename = os.path.basename(ligand_file).split(".")[0]
    ligand_mol2 = os.path.join(
        self.base_dir, ligand_basename + ".mol2")

    # Write mol2 file for ligand
    obConversion = ob.OBConversion()
    conv_out = obConversion.SetInAndOutFormats(str("sdf"), str("mol2"))
    ob_mol = ob.OBMol()
    obConversion.ReadFile(ob_mol, str(ligand_file))
    obConversion.WriteFile(ob_mol, str(ligand_mol2))
      
    # Featurize ligand
    mol = Chem.MolFromMol2File(str(ligand_mol2), removeHs=False)
    if mol is None:
      return None, None
    # Default for CircularFingerprint
    n_ligand_features = 1024
    ligand_features = self.ligand_featurizer.featurize([mol])

    # Featurize pocket
    pockets, pocket_atoms_map, pocket_coords = self.convex_finder.find_pockets(
        protein_file, ligand_file)
    n_pockets = len(pockets)
    n_pocket_features = BindingPocketFeaturizer.n_features

    features = np.zeros((n_pockets, n_pocket_features+n_ligand_features))
    pocket_features = self.pocket_featurizer.featurize(
        protein_file, pockets, pocket_atoms_map, pocket_coords)
    # Note broadcast operation
    features[:, :n_pocket_features] = pocket_features
    features[:, n_pocket_features:] = ligand_features
    dataset = NumpyDataset(X=features)
    pocket_preds = self.model.predict(dataset)
    pocket_pred_proba = np.squeeze(self.model.predict_proba(dataset))

    # Find pockets which are active
    active_pockets = []
    active_pocket_atoms_map = {}
    active_pocket_coords = []
    for pocket_ind in range(len(pockets)):
      #################################################### DEBUG
      # TODO(rbharath): For now, using a weak cutoff. Fix later.
      #if pocket_preds[pocket_ind] == 1:
      if pocket_pred_proba[pocket_ind][1] > .15:
      #################################################### DEBUG
        pocket = pockets[pocket_ind]
        active_pockets.append(pocket)
        active_pocket_atoms_map[pocket] = pocket_atoms_map[pocket]
        active_pocket_coords.append(pocket_coords[pocket_ind])
    return active_pockets, active_pocket_atoms_map, active_pocket_coords
+6 −4
Original line number Diff line number Diff line
@@ -30,7 +30,7 @@ class Docker(object):
class VinaGridRFDocker(Docker):
  """Vina pose-generation, RF-models on grid-featurization of complexes."""

  def __init__(self):
  def __init__(self, exhaustiveness=10, detect_pockets=False):
    """Builds model."""
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
@@ -44,7 +44,8 @@ class VinaGridRFDocker(Docker):
    model.reload()

    self.pose_scorer = GridPoseScorer(model, feat="grid")
    self.pose_generator = VinaPoseGenerator() 
    self.pose_generator = VinaPoseGenerator(
        exhaustiveness=exhaustiveness, detect_pockets=detect_pockets) 

  def dock(self, protein_file, ligand_file):
    """Docks using Vina and RF."""
@@ -56,7 +57,7 @@ class VinaGridRFDocker(Docker):
class VinaGridDNNDocker(object):
  """Vina pose-generation, DNN-models on grid-featurization of complexes."""

  def __init__(self, n_trees=100):
  def __init__(self, exhaustiveness=10, detect_pockets=False):
    """Builds model."""
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
@@ -74,7 +75,8 @@ class VinaGridDNNDocker(object):
    model.reload()

    self.pose_scorer = GridPoseScorer(model, feat="grid")
    self.pose_generator = VinaPoseGenerator() 
    self.pose_generator = VinaPoseGenerator(
        exhaustiveness=exhaustiveness, detect_pockets=detect_pockets) 

  def dock(self, protein_file, ligand_file):
    """Docks using Vina and DNNs."""
+26 −4
Original line number Diff line number Diff line
@@ -13,8 +13,9 @@ import numpy as np
import os
import pybel
import tempfile
from deepchem.feat import hydrogenate_and_compute_partial_charges
from subprocess import call
from deepchem.feat import hydrogenate_and_compute_partial_charges
from deepchem.dock.binding_pocket import RFConvexHullPocketFinder

class PoseGenerator(object):
  """Abstract superclass for all pose-generation routines."""
@@ -56,12 +57,16 @@ def get_molecule_data(pybel_molecule):


class VinaPoseGenerator(PoseGenerator):
  """Uses Autodock Vina to generate binding poses."""

  def __init__(self, exhaustiveness=1):
  def __init__(self, exhaustiveness=10, detect_pockets=True):
    """Initializes Vina Pose generation"""
    current_dir = os.path.dirname(os.path.realpath(__file__))
    self.vina_dir = os.path.join(current_dir, "autodock_vina_1_1_2_linux_x86")
    self.exhaustiveness = exhaustiveness
    self.detect_pockets = detect_pockets
    if self.detect_pockets:
      self.pocket_finder = RFConvexHullPocketFinder()
    if not os.path.exists(self.vina_dir):
      print("Vina not available. Downloading")
      # TODO(rbharath): May want to move this file to S3 so we can ensure it's
@@ -97,8 +102,25 @@ class VinaPoseGenerator(PoseGenerator):
    receptor_pybel = next(pybel.readfile(str("pdb"), str(protein_hyd)))
    # TODO(rbharath): Need to add some way to identify binding pocket, or this is
    # going to be extremely slow!
    if not self.detect_pockets:
      protein_centroid, protein_range = get_molecule_data(receptor_pybel)
      box_dims = protein_range + 5.0
    else:
      print("About to find putative binding pockets")
      pockets, pocket_atoms_maps, pocket_coords = self.pocket_finder.find_pockets(
          protein_file, ligand_file)
      # TODO(rbharath): Handle multiple pockets instead of arbitrarily selecting
      # first pocket. 
      print("Computing centroid and size of proposed pocket.")
      pocket_coord = pocket_coords[0]
      protein_centroid = np.mean(pocket_coord, axis=1)
      pocket = pockets[0]
      (x_min, x_max), (y_min, y_max), (z_min, z_max) = pocket
      x_box = (x_max - x_min)/2.
      y_box = (y_max - y_min)/2.
      z_box = (z_max - z_min)/2.
      box_dims = (x_box, y_box, z_box)


    # Prepare receptor
    ligand_name = os.path.basename(ligand_file).split(".")[0]
Loading