Commit e392516f authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Utils

parent 83c4b3f4
Loading
Loading
Loading
Loading
+82 −258
Original line number Diff line number Diff line
"""
Computes putative binding pockets on protein.
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2017, Stanford University"
__license__ = "MIT"

import os
import logging
import tempfile
import numpy as np
from subprocess import call
from scipy.spatial import ConvexHull
from deepchem.feat.binding_pocket_features import BindingPocketFeaturizer
from deepchem.feat.fingerprints import CircularFingerprint
from deepchem.models.sklearn_models import SklearnModel
from deepchem.utils import rdkit_util
from deepchem.utils import coordinate_box_utils as box_utils
from deepchem.utils.fragment_util import get_contact_atom_indices

logger = logging.getLogger(__name__)


def extract_active_site(protein_file, ligand_file, cutoff=4):
  """Extracts a box for the active site."""
  protein_coords = rdkit_util.load_molecule(
      protein_file, add_hydrogens=False)[0]
  ligand_coords = rdkit_util.load_molecule(
      ligand_file, add_hydrogens=True, calc_charges=True)[0]
  num_ligand_atoms = len(ligand_coords)
  num_protein_atoms = len(protein_coords)
  pocket_inds = []
  pocket_atoms = set([])
  for lig_atom_ind in range(num_ligand_atoms):
    lig_atom = ligand_coords[lig_atom_ind]
    for protein_atom_ind in range(num_protein_atoms):
      protein_atom = protein_coords[protein_atom_ind]
      if np.linalg.norm(lig_atom - protein_atom) < cutoff:
        if protein_atom_ind not in pocket_atoms:
          pocket_atoms = pocket_atoms.union(set([protein_atom_ind]))
  # Should be an array of size (n_pocket_atoms, 3)
  pocket_atoms = list(pocket_atoms)
  n_pocket_atoms = len(pocket_atoms)
  pocket_coords = np.zeros((n_pocket_atoms, 3))
  for ind, pocket_ind in enumerate(pocket_atoms):
    pocket_coords[ind] = protein_coords[pocket_ind]
  """Extracts a box for the active site.

  Params
  ------
  protein_file: str
    Location of protein PDB
  ligand_file: str
    Location of ligand input file
  cutoff: int, optional
    The distance in angstroms from the protein pocket to
    consider for featurization.
  """
  protein = rdkit_util.load_molecule(
      protein_file, add_hydrogens=False)
  ligand = rdkit_util.load_molecule(
      ligand_file, add_hydrogens=True, calc_charges=True)
  protein_contacts, ligand_contacts = get_contact_atom_indices([protein, ligand], cutoff=cutoff)
  protein_coords = protein[0]
  pocket_coords = protein_coords[protein_contacts]

  x_min = int(np.floor(np.amin(pocket_coords[:, 0])))
  x_max = int(np.ceil(np.amax(pocket_coords[:, 0])))
@@ -49,131 +41,32 @@ def extract_active_site(protein_file, ligand_file, cutoff=4):
  y_max = int(np.ceil(np.amax(pocket_coords[:, 1])))
  z_min = int(np.floor(np.amin(pocket_coords[:, 2])))
  z_max = int(np.ceil(np.amax(pocket_coords[:, 2])))
  return (((x_min, x_max), (y_min, y_max), (z_min, z_max)), pocket_atoms,
          pocket_coords)


def compute_overlap(mapping, box1, box2):
  """Computes overlap between the two boxes.

  Overlap is defined as % atoms of box1 in box2. Note that
  overlap is not a symmetric measurement.
  """
  atom1 = set(mapping[box1])
  atom2 = set(mapping[box2])
  return len(atom1.intersection(atom2)) / float(len(atom1))

  box = box_utils.CoordinateBox((x_min, x_max), (y_min, y_max), (z_min, z_max))
  return (box, pocket_coords)

def get_all_boxes(coords, pad=5):
  """Get all pocket boxes for protein coords.

  We pad all boxes the prescribed number of angstroms.

  TODO(rbharath): It looks like this may perhaps be non-deterministic?
  """
  hull = ConvexHull(coords)
  boxes = []
  for triangle in hull.simplices:
    # coords[triangle, 0] gives the x-dimension of all triangle points
    # Take transpose to make sure rows correspond to atoms.
    points = np.array(
        [coords[triangle, 0], coords[triangle, 1], coords[triangle, 2]]).T
    # We voxelize so all grids have integral coordinates (convenience)
    x_min, x_max = np.amin(points[:, 0]), np.amax(points[:, 0])
    x_min, x_max = int(np.floor(x_min)) - pad, int(np.ceil(x_max)) + pad
    y_min, y_max = np.amin(points[:, 1]), np.amax(points[:, 1])
    y_min, y_max = int(np.floor(y_min)) - pad, int(np.ceil(y_max)) + pad
    z_min, z_max = np.amin(points[:, 2]), np.amax(points[:, 2])
    z_min, z_max = int(np.floor(z_min)) - pad, int(np.ceil(z_max)) + pad
    boxes.append(((x_min, x_max), (y_min, y_max), (z_min, z_max)))
  return boxes


def boxes_to_atoms(atom_coords, boxes):
  """Maps each box to a list of atoms in that box.

  TODO(rbharath): This does a num_atoms x num_boxes computations. Is
  there a reasonable heuristic we can use to speed this up?
class BindingPocketFinder(object):
  """Abstract superclass for binding pocket detectors

  Many times when working with a new protein or other macromolecule,
  it's not clear what zones of the macromolecule may be good targets
  for potential ligands or other molecules to interact with. This
  abstract class provides a template for child classes that
  algorithmically locate potential binding pockets that are good
  potential interaction sites.

  Note that potential interactions sites can be found by many
  different methods, and that this abstract class doesn't specify the
  technique to be used.
  """
  mapping = {}
  for box_ind, box in enumerate(boxes):
    box_atoms = []
    (x_min, x_max), (y_min, y_max), (z_min, z_max) = box
    logger.info("Handing box %d/%d" % (box_ind, len(boxes)))
    for atom_ind in range(len(atom_coords)):
      atom = atom_coords[atom_ind]
      x_cont = x_min <= atom[0] and atom[0] <= x_max
      y_cont = y_min <= atom[1] and atom[1] <= y_max
      z_cont = z_min <= atom[2] and atom[2] <= z_max
      if x_cont and y_cont and z_cont:
        box_atoms.append(atom_ind)
    mapping[box] = box_atoms
  return mapping

  def find_pockets(self, molecule):
    """Finds potential binding pockets in proteins.

def merge_boxes(box1, box2):
  """Merges two boxes."""
  (x_min1, x_max1), (y_min1, y_max1), (z_min1, z_max1) = box1
  (x_min2, x_max2), (y_min2, y_max2), (z_min2, z_max2) = box2
  x_min = min(x_min1, x_min2)
  y_min = min(y_min1, y_min2)
  z_min = min(z_min1, z_min2)
  x_max = max(x_max1, x_max2)
  y_max = max(y_max1, y_max2)
  z_max = max(z_max1, z_max2)
  return ((x_min, x_max), (y_min, y_max), (z_min, z_max))


def merge_overlapping_boxes(mapping, boxes, threshold=.8):
  """Merge boxes which have an overlap greater than threshold.

  TODO(rbharath): This merge code is terribly inelegant. It's also quadratic
  in number of boxes. It feels like there ought to be an elegant divide and
  conquer approach here. Figure out later...
    Parameters
    ----------
    molecule: object
      Some representation of a molecule.
    """
  num_boxes = len(boxes)
  outputs = []
  for i in range(num_boxes):
    box = boxes[0]
    new_boxes = []
    new_mapping = {}
    # If overlap of box with previously generated output boxes, return
    contained = False
    for output_box in outputs:
      # Carry forward mappings
      new_mapping[output_box] = mapping[output_box]
      if compute_overlap(mapping, box, output_box) == 1:
        contained = True
    if contained:
      continue
    # We know that box has at least one atom not in outputs
    unique_box = True
    for merge_box in boxes[1:]:
      overlap = compute_overlap(mapping, box, merge_box)
      if overlap < threshold:
        new_boxes.append(merge_box)
        new_mapping[merge_box] = mapping[merge_box]
      else:
        # Current box has been merged into box further down list.
        # No need to output current box
        unique_box = False
        merged = merge_boxes(box, merge_box)
        new_boxes.append(merged)
        new_mapping[merged] = list(
            set(mapping[box]).union(set(mapping[merge_box])))
    if unique_box:
      outputs.append(box)
      new_mapping[box] = mapping[box]
    boxes = new_boxes
    mapping = new_mapping
  return outputs, mapping


class BindingPocketFinder(object):
  """Abstract superclass for binding pocket detectors"""

  def find_pockets(self, protein_file, ligand_file):
    """Finds potential binding pockets in proteins."""
    raise NotImplementedError


@@ -183,119 +76,50 @@ class ConvexHullPocketFinder(BindingPocketFinder):
  Based on https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4112621/pdf/1472-6807-14-18.pdf
  """

  def __init__(self, pad=5):
  def __init__(self, scoring_model=None, pad=5):
    """Initialize the pocket finder.

    Parameters
    ----------
    scoring_model: `dc.models.Model`, optional
      If specified, use this model to prune pockets.
    pad: float, optional
      The number of angstroms to pad around a binding pocket's atoms
      to get a binding pocket box.
    """
    self.scoring_model = scoring_model
    self.pad = pad

  def find_all_pockets(self, protein_file):
    """Find list of binding pockets on protein."""
    # protein_coords is (N, 3) tensor
    coords = rdkit_util.load_molecule(protein_file)[0]
    return get_all_boxes(coords, self.pad)

  def find_pockets(self, protein_file, ligand_file):
    """Find list of suitable binding pockets on protein."""
    protein_coords = rdkit_util.load_molecule(
        protein_file, add_hydrogens=False, calc_charges=False)[0]
    ligand_coords = rdkit_util.load_molecule(
        ligand_file, add_hydrogens=False, calc_charges=False)[0]
    boxes = get_all_boxes(protein_coords, self.pad)
    mapping = boxes_to_atoms(protein_coords, boxes)
    pockets, pocket_atoms_map = merge_overlapping_boxes(mapping, boxes)
    pocket_coords = []
    for pocket in pockets:
      atoms = pocket_atoms_map[pocket]
      coords = np.zeros((len(atoms), 3))
      for ind, atom in enumerate(atoms):
        coords[ind] = protein_coords[atom]
      pocket_coords.append(coords)
    return pockets, pocket_atoms_map, pocket_coords


class RFConvexHullPocketFinder(BindingPocketFinder):
  """Uses pre-trained RF model + ConvexHulPocketFinder to select pockets."""
    """Find list of binding pockets on protein.
    
  def __init__(self, pad=5):
    self.pad = pad
    self.convex_finder = ConvexHullPocketFinder(pad)

    # Load binding pocket model
    self.base_dir = tempfile.mkdtemp()
    logger.info("About to download trained model.")
    # TODO(rbharath): Shift refined to full once trained.
    call((
        "wget -nv -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz"
    ).split())
    call(("tar -zxvf pocket_random_refined_RF.tar.gz").split())
    call(("mv pocket_random_refined_RF %s" % (self.base_dir)).split())
    self.model_dir = os.path.join(self.base_dir, "pocket_random_refined_RF")
    Parameters
    ----------
    protein_file: str
      Protein to load in.
    """
    coords, _ = rdkit_util.load_molecule(protein_file)
    return box_utils.get_face_boxes(coords, self.pad)

    # Fit model on dataset
    self.model = SklearnModel(model_dir=self.model_dir)
    self.model.reload()
  def find_pockets(self, macromolecule_file):
    """Find list of suitable binding pockets on protein.

    # Create featurizers
    self.pocket_featurizer = BindingPocketFeaturizer()
    self.ligand_featurizer = CircularFingerprint(size=1024)
    This function computes putative binding pockets on this protein.
    This class uses the `ConvexHull` to compute binding pockets. Each
    face of the hull is converted into a coordinate box used for
    binding.

  def find_pockets(self, protein_file, ligand_file):
    """Compute features for a given complex
    Params
    ------
    macromolecule_file: str
      Location of the macromolecule file to load

    TODO(rbharath): This has a log of code overlap with
    compute_binding_pocket_features in
    examples/binding_pockets/binding_pocket_datasets.py. Find way to refactor
    to avoid code duplication.
    Returns
    -------
    List of pockets. Each pocket is a `CoordinateBox`
    """
    # if not ligand_file.endswith(".sdf"):
    #   raise ValueError("Only .sdf ligand files can be featurized.")
    # ligand_basename = os.path.basename(ligand_file).split(".")[0]
    # ligand_mol2 = os.path.join(
    #     self.base_dir, ligand_basename + ".mol2")
    #
    # # Write mol2 file for ligand
    # obConversion = ob.OBConversion()
    # conv_out = obConversion.SetInAndOutFormats(str("sdf"), str("mol2"))
    # ob_mol = ob.OBMol()
    # obConversion.ReadFile(ob_mol, str(ligand_file))
    # obConversion.WriteFile(ob_mol, str(ligand_mol2))
    #
    # # Featurize ligand
    # mol = Chem.MolFromMol2File(str(ligand_mol2), removeHs=False)
    # if mol is None:
    #   return None, None
    # # Default for CircularFingerprint
    # n_ligand_features = 1024
    # ligand_features = self.ligand_featurizer.featurize([mol])
    #
    # # Featurize pocket
    # pockets, pocket_atoms_map, pocket_coords = self.convex_finder.find_pockets(
    #     protein_file, ligand_file)
    # n_pockets = len(pockets)
    # n_pocket_features = BindingPocketFeaturizer.n_features
    #
    # features = np.zeros((n_pockets, n_pocket_features+n_ligand_features))
    # pocket_features = self.pocket_featurizer.featurize(
    #     protein_file, pockets, pocket_atoms_map, pocket_coords)
    # # Note broadcast operation
    # features[:, :n_pocket_features] = pocket_features
    # features[:, n_pocket_features:] = ligand_features
    # dataset = NumpyDataset(X=features)
    # pocket_preds = self.model.predict(dataset)
    # pocket_pred_proba = np.squeeze(self.model.predict_proba(dataset))
    #
    # # Find pockets which are active
    # active_pockets = []
    # active_pocket_atoms_map = {}
    # active_pocket_coords = []
    # for pocket_ind in range(len(pockets)):
    #   #################################################### DEBUG
    #   # TODO(rbharath): For now, using a weak cutoff. Fix later.
    #   #if pocket_preds[pocket_ind] == 1:
    #   if pocket_pred_proba[pocket_ind][1] > .15:
    #   #################################################### DEBUG
    #     pocket = pockets[pocket_ind]
    #     active_pockets.append(pocket)
    #     active_pocket_atoms_map[pocket] = pocket_atoms_map[pocket]
    #     active_pocket_coords.append(pocket_coords[pocket_ind])
    # return active_pockets, active_pocket_atoms_map, active_pocket_coords
    # # TODO(LESWING)
    raise ValueError("Karl Implement")
    coords = rdkit_util.load_molecule(
        macromolecule_file, add_hydrogens=False, calc_charges=False)[0]
    boxes = box_utils.get_face_boxes(coords, self.pad)
    boxes = box_utils.merge_overlapping_boxes(boxes)
    return boxes
+72 −102
Original line number Diff line number Diff line
"""
Docks protein-ligand pairs
Docks Molecular Complexes 
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import logging
import numpy as np
import os
import tempfile
from deepchem.data import DiskDataset
from deepchem.models import SklearnModel
from deepchem.models import MultitaskRegressor
from deepchem.dock.pose_scoring import GridPoseScorer
from deepchem.dock.pose_generation import VinaPoseGenerator
from sklearn.ensemble import RandomForestRegressor
from subprocess import call

logger = logging.getLogger(__name__)


class Docker(object):
  """Abstract Class specifying API for Docking."""

  def dock(self,
           protein_file,
           ligand_file,
           centroid=None,
           box_dims=None,
           dry_run=False):
    raise NotImplementedError

  """A generic molecular docking class

class VinaGridRFDocker(Docker):
  """Vina pose-generation, RF-models on grid-featurization of complexes."""
  This class provides a docking engine which uses provided models for
  featurization, pose generation, and scoring. Most pieces of docking
  software are command line tools that are invoked from the shell. The
  goal of this class is to provide a python clean API for invoking
  molecular docking programmatically.

  def __init__(self, exhaustiveness=10, detect_pockets=False):
    """Builds model."""
    self.base_dir = tempfile.mkdtemp()
    logger.info("About to download trained model.")
    call((
        "wget -nv -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/random_full_RF.tar.gz"
    ).split())
    call(("tar -zxvf random_full_RF.tar.gz").split())
    call(("mv random_full_RF %s" % (self.base_dir)).split())
    self.model_dir = os.path.join(self.base_dir, "random_full_RF")

    # Fit model on dataset
    model = SklearnModel(model_dir=self.model_dir)
    model.reload()

    self.pose_scorer = GridPoseScorer(model, feat="grid")
    self.pose_generator = VinaPoseGenerator(
        exhaustiveness=exhaustiveness, detect_pockets=detect_pockets)

  def dock(self,
           protein_file,
           ligand_file,
           centroid=None,
           box_dims=None,
           dry_run=False):
    """Docks using Vina and RF."""
    protein_docked, ligand_docked = self.pose_generator.generate_poses(
        protein_file, ligand_file, centroid, box_dims, dry_run)
    if not dry_run:
      score = self.pose_scorer.score(protein_docked, ligand_docked)
    else:
      score = np.zeros((1,))
    return (score, (protein_docked, ligand_docked))


'''
class VinaGridDNNDocker(object):
  """Vina pose-generation, DNN-models on grid-featurization of complexes."""
  The implementation of this class is lightweight and generic. It's
  expected that the majority of the heavy lifting will be done by pose
  generation and scoring classes that are provided to this class.
  """

  def __init__(self, exhaustiveness=10, detect_pockets=False):
    """Builds model."""
  def __init__(self,
               pose_generator,
               featurizer=None,
               scoring_model=None):
    """Builds model.

    Parameters
    ----------
    pose_generator: `PoseGenerator`
      The pose generator to use for this model
    featurizer: `ComplexFeaturizer`
      Featurizer associated with `scoring_model`
    scoring_model: `Model`
      Should make predictions on molecular complex.
    """
    self.base_dir = tempfile.mkdtemp()
    logger.info("About to download trained model.")
    call((
        "wget -nv -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/random_full_DNN.tar.gz"
    ).split())
    call(("tar -zxvf random_full_DNN.tar.gz").split())
    call(("mv random_full_DNN %s" % (self.base_dir)).split())
    self.model_dir = os.path.join(self.base_dir, "random_full_DNN")

    # Fit model on dataset
    pdbbind_tasks = ["-logKd/Ki"]
    n_features = 2052
    model = MultitaskRegressor(
        len(pdbbind_tasks),
        n_features,
        dropouts=[.25],
        learning_rate=0.0003,
        weight_init_stddevs=[.1],
        batch_size=64,
        model_dir=self.model_dir)
    model.reload()

    self.pose_scorer = GridPoseScorer(model, feat="grid")
    self.pose_generator = VinaPoseGenerator(
        exhaustiveness=exhaustiveness, detect_pockets=detect_pockets)
    self.pose_generator = pose_generator
    self.featurizer = featurizer
    self.scoring_model = scoring_model

  def dock(self,
           protein_file,
           ligand_file,
           molecular_complex,
           centroid=None,
           box_dims=None,
           dry_run=False):
    """Docks using Vina and DNNs."""
    protein_docked, ligand_docked = self.pose_generator.generate_poses(
        protein_file, ligand_file, centroid, box_dims, dry_run)
    if not dry_run:
      score = self.pose_scorer.score(protein_docked, ligand_docked)
           exhaustiveness=10,
           num_modes=9,
           num_pockets=None,
           out_dir=None):
    """Docks using Vina and RF.

    Parameters
    ----------
    molecular_complex: Object
      Some representation of a molecular complex.
    exhaustiveness: int, optional (default 10)
      Tells Autodock Vina how exhaustive it should be with pose
      generation.
    num_modes: int, optional (default 9)
      Tells Autodock Vina how many binding modes it should generate at
      each invocation.
    num_pockets: int, optional (default None)
      If specified, `self.pocket_finder` must be set. Will only
      generate poses for the first `num_pockets` returned by
      `self.pocket_finder`.
    out_dir: str, optional
      If specified, write generated poses to this directory.
    """
    complexes = self.pose_generator.generate_poses(molecular_complex,
                                                   centroid=centroid,
                                                   box_dims=box_dims,
                                                   exhaustiveness=exhaustiveness,
                                                   num_modes=num_modes,
                                                   num_pockets=num_pockets,
                                                   out_dir=out_dir)
    for posed_complex in complexes:
      if self.featurizer is not None:
        # TODO: How to handle the failure here?
        features, _ = self.featurizer.featurize_complexes([molecular_complex])
        dataset = NumpyDataset(X=features)
        score = self.model.predict(dataset)
        yield (score, posed_complex)
      else:
      score = np.zeros((1,))
    return (score, (protein_docked, ligand_docked))
'''
        yield posed_complex
+208 −101

File changed.

Preview size limit exceeded, changes collapsed.

+185 −46

File changed.

Preview size limit exceeded, changes collapsed.

+11 −128

File changed.

Preview size limit exceeded, changes collapsed.

Loading