Commit 767e32f2 authored by miaecle's avatar miaecle
Browse files

Merge branch 'master' of https://github.com/deepchem/deepchem into nci_ref

parents 218f3d0a 5e15315e
Loading
Loading
Loading
Loading
+9 −5
Original line number Diff line number Diff line
@@ -251,9 +251,12 @@ Scaffold splitting
|           |graphconv regression|Random      |0.996         |0.873         |
|           |MT-NN regression    |Scaffold    |0.782         |0.426         |
|           |graphconv regression|Scaffold    |0.994         |0.606         |
|nci        |MT-NN regression    |Index       |0.890         |0.890         |
|           |MT-NN regression    |Random      |0.891         |0.888         |
|           |MT-NN regression    |Scaffold    |0.912         |0.020         |
|nci        |MT-NN regression    |Index       |0.171         |0.062         |
|           |graphconv regression|Index       |0.123         |0.048         |
|           |MT-NN regression    |Random      |0.168         |0.085         |
|           |graphconv regression|Random      |0.117         |0.076         |
|           |MT-NN regression    |Scaffold    |0.180         |0.052         |
|           |graphconv regression|Scaffold    |0.131         |0.046         |
|kaggle     |MT-NN regression    |User-defined|0.748         |0.452         |

* General features
@@ -269,7 +272,7 @@ Number of tasks and examples in the datasets
|toxcast    |617        |8615       |
|delaney    |1          |1128       |
|kaggle     |15         |173065     |
|nci        |60         |1057371    |
|nci        |60         |19127      |

Time needed for benchmark test(~20h in total)

@@ -297,7 +300,8 @@ Time needed for benchmark test(~20h in total)
|           |graph convolution   |80              |900            |
|delaney    |MT-NN regression    |10              |40             |
|           |graphconv regression|10              |40             |
|nci        |MT-NN regression    |2000            |30000          |
|nci        |MT-NN regression    |400             |1200           |
|           |graphconv regression|400             |2500           |
|kaggle     |MT-NN regression    |2200            |3200           |


+9 −4
Original line number Diff line number Diff line
@@ -225,13 +225,18 @@ class Dataset(object):
class NumpyDataset(Dataset):
  """A Dataset defined by in-memory numpy arrays."""

  def __init__(self, X, y, w=None, ids=None):
  def __init__(self, X, y=None, w=None, ids=None):
    n_samples = len(X)
    # The -1 indicates that y will be reshaped to have length -1
    if n_samples > 0:
      if y is not None:
        y = np.reshape(y, (n_samples, -1))
        if w is not None:
          w = np.reshape(w, (n_samples, -1))
      else:
        # Set labels to be zero, with zero weights
        y = np.zeros((n_samples, 1))
        w = np.zeros_like(y)
    n_tasks = y.shape[1]
    if ids is None:
      ids = np.arange(n_samples)
+1 −0
Original line number Diff line number Diff line
@@ -13,3 +13,4 @@ from deepchem.dock.docking import Docker
from deepchem.dock.docking import VinaGridRFDocker
from deepchem.dock.docking import VinaGridDNNDocker
from deepchem.dock.binding_pocket import ConvexHullPocketFinder
from deepchem.dock.binding_pocket import RFConvexHullPocketFinder
+108 −11
Original line number Diff line number Diff line
@@ -9,20 +9,25 @@ __author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2017, Stanford University"
__license__ = "GPL"

import numpy as np
import os
import pybel
import tempfile
import numpy as np
import openbabel as ob
from rdkit import Chem
from subprocess import call
from scipy.spatial import ConvexHull
from deepchem.feat import hydrogenate_and_compute_partial_charges
from deepchem.feat.atomic_coordinates import AtomicCoordinates
from deepchem.feat.grid_featurizer import load_molecule
from subprocess import call
from deepchem.feat.binding_pocket_features import BindingPocketFeaturizer 
from deepchem.feat.fingerprints import CircularFingerprint 
from deepchem.models.sklearn_models import SklearnModel
from deepchem.data.datasets import NumpyDataset

def extract_active_site(protein_file, ligand_file, cutoff=4):
  """Extracts a box for the active site."""
  protein_coords = load_molecule(protein_file)[0]
  ligand_coords = load_molecule(ligand_file)[0]
  protein_coords = load_molecule(protein_file, add_hydrogens=False)[0]
  ligand_coords = load_molecule(ligand_file, add_hydrogens=False)[0]
  num_ligand_atoms = len(ligand_coords)
  num_protein_atoms = len(protein_coords)
  pocket_inds = []
@@ -164,7 +169,7 @@ def merge_overlapping_boxes(mapping, boxes, threshold=.8):
class BindingPocketFinder(object):
  """Abstract superclass for binding pocket detectors"""

  def find_pockets(self, protein_file):
  def find_pockets(self, protein_file, ligand_file):
    """Finds potential binding pockets in proteins."""
    raise NotImplementedError

@@ -179,14 +184,106 @@ class ConvexHullPocketFinder(BindingPocketFinder):
  def find_all_pockets(self, protein_file):
    """Find list of binding pockets on protein."""
    # protein_coords is (N, 3) tensor
    coords = load_molecule(protein_file)[0]
    coords = load_molecule(protein_file, add_hydrogens=False)[0]
    return get_all_boxes(coords, self.pad)

  def find_pockets(self, protein_file, ligand_file):
    """Find list of suitable binding pockets on protein."""
    protein_coords = load_molecule(protein_file)[0]
    ligand_coords = load_molecule(ligand_file)[0]
    protein_coords = load_molecule(protein_file, add_hydrogens=False)[0]
    ligand_coords = load_molecule(ligand_file, add_hydrogens=False)[0]
    boxes = get_all_boxes(protein_coords, self.pad)
    mapping = boxes_to_atoms(protein_coords, boxes)
    merged_boxes, mapping = merge_overlapping_boxes(mapping, boxes)
    return merged_boxes, mapping
    pockets, pocket_atoms_map = merge_overlapping_boxes(mapping, boxes)
    pocket_coords = []
    for pocket in pockets:
      atoms = pocket_atoms_map[pocket]
      coords = np.zeros((len(atoms), 3))
      for ind, atom in enumerate(atoms):
        coords[ind] = protein_coords[atom]
      pocket_coords.append(coords)
    return pockets, pocket_atoms_map, pocket_coords

class RFConvexHullPocketFinder(BindingPocketFinder):
  """Uses pre-trained RF model + ConvexHulPocketFinder to select pockets."""

  def __init__(self, pad=5):
    self.pad = pad
    self.convex_finder = ConvexHullPocketFinder(pad)

    # Load binding pocket model
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
    # TODO(rbharath): Shift refined to full once trained.
    call(("wget -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz").split())
    call(("tar -zxvf pocket_random_refined_RF.tar.gz").split())
    call(("mv pocket_random_refined_RF %s" % (self.base_dir)).split())
    self.model_dir = os.path.join(self.base_dir, "pocket_random_refined_RF")

    # Fit model on dataset
    self.model = SklearnModel(model_dir=self.model_dir)
    self.model.reload()

    # Create featurizers
    self.pocket_featurizer = BindingPocketFeaturizer()
    self.ligand_featurizer = CircularFingerprint(size=1024)

  def find_pockets(self, protein_file, ligand_file):
    """Compute features for a given complex

    TODO(rbharath): This has a log of code overlap with
    compute_binding_pocket_features in
    examples/binding_pockets/binding_pocket_datasets.py. Find way to refactor
    to avoid code duplication.
    """
    if not ligand_file.endswith(".sdf"):
      raise ValueError("Only .sdf ligand files can be featurized.")
    ligand_basename = os.path.basename(ligand_file).split(".")[0]
    ligand_mol2 = os.path.join(
        self.base_dir, ligand_basename + ".mol2")

    # Write mol2 file for ligand
    obConversion = ob.OBConversion()
    conv_out = obConversion.SetInAndOutFormats(str("sdf"), str("mol2"))
    ob_mol = ob.OBMol()
    obConversion.ReadFile(ob_mol, str(ligand_file))
    obConversion.WriteFile(ob_mol, str(ligand_mol2))
      
    # Featurize ligand
    mol = Chem.MolFromMol2File(str(ligand_mol2), removeHs=False)
    if mol is None:
      return None, None
    # Default for CircularFingerprint
    n_ligand_features = 1024
    ligand_features = self.ligand_featurizer.featurize([mol])

    # Featurize pocket
    pockets, pocket_atoms_map, pocket_coords = self.convex_finder.find_pockets(
        protein_file, ligand_file)
    n_pockets = len(pockets)
    n_pocket_features = BindingPocketFeaturizer.n_features

    features = np.zeros((n_pockets, n_pocket_features+n_ligand_features))
    pocket_features = self.pocket_featurizer.featurize(
        protein_file, pockets, pocket_atoms_map, pocket_coords)
    # Note broadcast operation
    features[:, :n_pocket_features] = pocket_features
    features[:, n_pocket_features:] = ligand_features
    dataset = NumpyDataset(X=features)
    pocket_preds = self.model.predict(dataset)
    pocket_pred_proba = np.squeeze(self.model.predict_proba(dataset))

    # Find pockets which are active
    active_pockets = []
    active_pocket_atoms_map = {}
    active_pocket_coords = []
    for pocket_ind in range(len(pockets)):
      #################################################### DEBUG
      # TODO(rbharath): For now, using a weak cutoff. Fix later.
      #if pocket_preds[pocket_ind] == 1:
      if pocket_pred_proba[pocket_ind][1] > .15:
      #################################################### DEBUG
        pocket = pockets[pocket_ind]
        active_pockets.append(pocket)
        active_pocket_atoms_map[pocket] = pocket_atoms_map[pocket]
        active_pocket_coords.append(pocket_coords[pocket_ind])
    return active_pockets, active_pocket_atoms_map, active_pocket_coords
+6 −4
Original line number Diff line number Diff line
@@ -30,7 +30,7 @@ class Docker(object):
class VinaGridRFDocker(Docker):
  """Vina pose-generation, RF-models on grid-featurization of complexes."""

  def __init__(self):
  def __init__(self, exhaustiveness=10, detect_pockets=False):
    """Builds model."""
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
@@ -44,7 +44,8 @@ class VinaGridRFDocker(Docker):
    model.reload()

    self.pose_scorer = GridPoseScorer(model, feat="grid")
    self.pose_generator = VinaPoseGenerator() 
    self.pose_generator = VinaPoseGenerator(
        exhaustiveness=exhaustiveness, detect_pockets=detect_pockets) 

  def dock(self, protein_file, ligand_file):
    """Docks using Vina and RF."""
@@ -56,7 +57,7 @@ class VinaGridRFDocker(Docker):
class VinaGridDNNDocker(object):
  """Vina pose-generation, DNN-models on grid-featurization of complexes."""

  def __init__(self, n_trees=100):
  def __init__(self, exhaustiveness=10, detect_pockets=False):
    """Builds model."""
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
@@ -74,7 +75,8 @@ class VinaGridDNNDocker(object):
    model.reload()

    self.pose_scorer = GridPoseScorer(model, feat="grid")
    self.pose_generator = VinaPoseGenerator() 
    self.pose_generator = VinaPoseGenerator(
        exhaustiveness=exhaustiveness, detect_pockets=detect_pockets) 

  def dock(self, protein_file, ligand_file):
    """Docks using Vina and DNNs."""
Loading