Merge remote-tracking branch 'upstream/master' (48704122) · Commits · 钟慕尧 / deepchem

deepchem/dock/__init__.py

+0 −4

Original line number	Diff line number	Diff line
		@@ -3,9 +3,5 @@ Imports all submodules
		"""
		from deepchem.dock.pose_generation import PoseGenerator
		from deepchem.dock.pose_generation import VinaPoseGenerator
		from deepchem.dock.pose_scoring import PoseScorer
		from deepchem.dock.pose_scoring import GridPoseScorer
		from deepchem.dock.docking import Docker
		from deepchem.dock.docking import VinaGridRFDocker
		from deepchem.dock.binding_pocket import ConvexHullPocketFinder
		from deepchem.dock.binding_pocket import RFConvexHullPocketFinder

deepchem/dock/binding_pocket.py

+87 −256

Original line number	Diff line number	Diff line
		"""
		Computes putative binding pockets on protein.
		"""
		__author__ = "Bharath Ramsundar"
		__copyright__ = "Copyright 2017, Stanford University"
		__license__ = "MIT"

		import os
		import logging
		import tempfile
		import numpy as np
		from subprocess import call
		from scipy.spatial import ConvexHull
		from deepchem.feat.binding_pocket_features import BindingPocketFeaturizer
		from deepchem.feat.fingerprints import CircularFingerprint
		from deepchem.models.sklearn_models import SklearnModel
		from deepchem.utils import rdkit_util
		from deepchem.utils import coordinate_box_utils as box_utils
		from deepchem.utils.fragment_util import get_contact_atom_indices

		logger = logging.getLogger(__name__)


		def extract_active_site(protein_file, ligand_file, cutoff=4):
		"""Extracts a box for the active site."""
		protein_coords = rdkit_util.load_molecule(
		protein_file, add_hydrogens=False)[0]
		ligand_coords = rdkit_util.load_molecule(
		ligand_file, add_hydrogens=True, calc_charges=True)[0]
		num_ligand_atoms = len(ligand_coords)
		num_protein_atoms = len(protein_coords)
		pocket_inds = []
		pocket_atoms = set([])
		for lig_atom_ind in range(num_ligand_atoms):
		lig_atom = ligand_coords[lig_atom_ind]
		for protein_atom_ind in range(num_protein_atoms):
		protein_atom = protein_coords[protein_atom_ind]
		if np.linalg.norm(lig_atom - protein_atom) < cutoff:
		if protein_atom_ind not in pocket_atoms:
		pocket_atoms = pocket_atoms.union(set([protein_atom_ind]))
		# Should be an array of size (n_pocket_atoms, 3)
		pocket_atoms = list(pocket_atoms)
		n_pocket_atoms = len(pocket_atoms)
		pocket_coords = np.zeros((n_pocket_atoms, 3))
		for ind, pocket_ind in enumerate(pocket_atoms):
		pocket_coords[ind] = protein_coords[pocket_ind]
		"""Extracts a box for the active site.

		Params
		------
		protein_file: str
		Location of protein PDB
		ligand_file: str
		Location of ligand input file
		cutoff: int, optional
		The distance in angstroms from the protein pocket to
		consider for featurization.

		Returns
		-------
		A tuple of `(CoordinateBox, np.ndarray)` where the second entry is
		of shape `(N, 3)` with `N` the number of atoms in the active site.
		"""
		protein = rdkit_util.load_molecule(protein_file, add_hydrogens=False)
		ligand = rdkit_util.load_molecule(
		ligand_file, add_hydrogens=True, calc_charges=True)
		protein_contacts, ligand_contacts = get_contact_atom_indices(
		[protein, ligand], cutoff=cutoff)
		protein_coords = protein[0]
		pocket_coords = protein_coords[protein_contacts]

		x_min = int(np.floor(np.amin(pocket_coords[:, 0])))
		x_max = int(np.ceil(np.amax(pocket_coords[:, 0])))
		@@ -49,131 +47,33 @@ def extract_active_site(protein_file, ligand_file, cutoff=4):
		y_max = int(np.ceil(np.amax(pocket_coords[:, 1])))
		z_min = int(np.floor(np.amin(pocket_coords[:, 2])))
		z_max = int(np.ceil(np.amax(pocket_coords[:, 2])))
		return (((x_min, x_max), (y_min, y_max), (z_min, z_max)), pocket_atoms,
		pocket_coords)


		def compute_overlap(mapping, box1, box2):
		"""Computes overlap between the two boxes.

		Overlap is defined as % atoms of box1 in box2. Note that
		overlap is not a symmetric measurement.
		"""
		atom1 = set(mapping[box1])
		atom2 = set(mapping[box2])
		return len(atom1.intersection(atom2)) / float(len(atom1))


		def get_all_boxes(coords, pad=5):
		"""Get all pocket boxes for protein coords.
		box = box_utils.CoordinateBox((x_min, x_max), (y_min, y_max), (z_min, z_max))
		return (box, pocket_coords)

		We pad all boxes the prescribed number of angstroms.

		TODO(rbharath): It looks like this may perhaps be non-deterministic?
		"""
		hull = ConvexHull(coords)
		boxes = []
		for triangle in hull.simplices:
		# coords[triangle, 0] gives the x-dimension of all triangle points
		# Take transpose to make sure rows correspond to atoms.
		points = np.array(
		[coords[triangle, 0], coords[triangle, 1], coords[triangle, 2]]).T
		# We voxelize so all grids have integral coordinates (convenience)
		x_min, x_max = np.amin(points[:, 0]), np.amax(points[:, 0])
		x_min, x_max = int(np.floor(x_min)) - pad, int(np.ceil(x_max)) + pad
		y_min, y_max = np.amin(points[:, 1]), np.amax(points[:, 1])
		y_min, y_max = int(np.floor(y_min)) - pad, int(np.ceil(y_max)) + pad
		z_min, z_max = np.amin(points[:, 2]), np.amax(points[:, 2])
		z_min, z_max = int(np.floor(z_min)) - pad, int(np.ceil(z_max)) + pad
		boxes.append(((x_min, x_max), (y_min, y_max), (z_min, z_max)))
		return boxes


		def boxes_to_atoms(atom_coords, boxes):
		"""Maps each box to a list of atoms in that box.

		TODO(rbharath): This does num_atoms x num_boxes computations. Is
		there a reasonable heuristic we can use to speed this up?
		class BindingPocketFinder(object):
		"""Abstract superclass for binding pocket detectors

		Many times when working with a new protein or other macromolecule,
		it's not clear what zones of the macromolecule may be good targets
		for potential ligands or other molecules to interact with. This
		abstract class provides a template for child classes that
		algorithmically locate potential binding pockets that are good
		potential interaction sites.

		Note that potential interaction sites can be found by many
		different methods, and that this abstract class doesn't specify the
		technique to be used.
		"""
		mapping = {}
		for box_ind, box in enumerate(boxes):
		box_atoms = []
		(x_min, x_max), (y_min, y_max), (z_min, z_max) = box
		logger.info("Handing box %d/%d" % (box_ind, len(boxes)))
		for atom_ind in range(len(atom_coords)):
		atom = atom_coords[atom_ind]
		x_cont = x_min <= atom[0] and atom[0] <= x_max
		y_cont = y_min <= atom[1] and atom[1] <= y_max
		z_cont = z_min <= atom[2] and atom[2] <= z_max
		if x_cont and y_cont and z_cont:
		box_atoms.append(atom_ind)
		mapping[box] = box_atoms
		return mapping


		def merge_boxes(box1, box2):
		"""Merges two boxes."""
		(x_min1, x_max1), (y_min1, y_max1), (z_min1, z_max1) = box1
		(x_min2, x_max2), (y_min2, y_max2), (z_min2, z_max2) = box2
		x_min = min(x_min1, x_min2)
		y_min = min(y_min1, y_min2)
		z_min = min(z_min1, z_min2)
		x_max = max(x_max1, x_max2)
		y_max = max(y_max1, y_max2)
		z_max = max(z_max1, z_max2)
		return ((x_min, x_max), (y_min, y_max), (z_min, z_max))
		def find_pockets(self, molecule):
		"""Finds potential binding pockets in proteins.


		def merge_overlapping_boxes(mapping, boxes, threshold=.8):
		"""Merge boxes which have an overlap greater than threshold.

		TODO(rbharath): This merge code is terribly inelegant. It's also quadratic
		in number of boxes. It feels like there ought to be an elegant divide and
		conquer approach here. Figure out later...
		Parameters
		----------
		molecule: object
		Some representation of a molecule.
		"""
		num_boxes = len(boxes)
		outputs = []
		for i in range(num_boxes):
		box = boxes[0]
		new_boxes = []
		new_mapping = {}
		# If overlap of box with previously generated output boxes, return
		contained = False
		for output_box in outputs:
		# Carry forward mappings
		new_mapping[output_box] = mapping[output_box]
		if compute_overlap(mapping, box, output_box) == 1:
		contained = True
		if contained:
		continue
		# We know that box has at least one atom not in outputs
		unique_box = True
		for merge_box in boxes[1:]:
		overlap = compute_overlap(mapping, box, merge_box)
		if overlap < threshold:
		new_boxes.append(merge_box)
		new_mapping[merge_box] = mapping[merge_box]
		else:
		# Current box has been merged into box further down list.
		# No need to output current box
		unique_box = False
		merged = merge_boxes(box, merge_box)
		new_boxes.append(merged)
		new_mapping[merged] = list(
		set(mapping[box]).union(set(mapping[merge_box])))
		if unique_box:
		outputs.append(box)
		new_mapping[box] = mapping[box]
		boxes = new_boxes
		mapping = new_mapping
		return outputs, mapping


		class BindingPocketFinder(object):
		"""Abstract superclass for binding pocket detectors"""

		def find_pockets(self, protein_file, ligand_file):
		"""Finds potential binding pockets in proteins."""
		raise NotImplementedError


		@@ -183,119 +83,50 @@ class ConvexHullPocketFinder(BindingPocketFinder):
		Based on https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4112621/pdf/1472-6807-14-18.pdf
		"""

		def __init__(self, pad=5):
		def __init__(self, scoring_model=None, pad=5):
		"""Initialize the pocket finder.

		Parameters
		----------
		scoring_model: `dc.models.Model`, optional
		If specified, use this model to prune pockets.
		pad: float, optional
		The number of angstroms to pad around a binding pocket's atoms
		to get a binding pocket box.
		"""
		self.scoring_model = scoring_model
		self.pad = pad

		def find_all_pockets(self, protein_file):
		"""Find list of binding pockets on protein."""
		# protein_coords is (N, 3) tensor
		coords = rdkit_util.load_molecule(protein_file)[0]
		return get_all_boxes(coords, self.pad)
		"""Find list of binding pockets on protein.

		def find_pockets(self, protein_file, ligand_file):
		"""Find list of suitable binding pockets on protein."""
		protein_coords = rdkit_util.load_molecule(
		protein_file, add_hydrogens=False, calc_charges=False)[0]
		ligand_coords = rdkit_util.load_molecule(
		ligand_file, add_hydrogens=False, calc_charges=False)[0]
		boxes = get_all_boxes(protein_coords, self.pad)
		mapping = boxes_to_atoms(protein_coords, boxes)
		pockets, pocket_atoms_map = merge_overlapping_boxes(mapping, boxes)
		pocket_coords = []
		for pocket in pockets:
		atoms = pocket_atoms_map[pocket]
		coords = np.zeros((len(atoms), 3))
		for ind, atom in enumerate(atoms):
		coords[ind] = protein_coords[atom]
		pocket_coords.append(coords)
		return pockets, pocket_atoms_map, pocket_coords


		class RFConvexHullPocketFinder(BindingPocketFinder):
		"""Uses pre-trained RF model + ConvexHullPocketFinder to select pockets."""

		def __init__(self, pad=5):
		self.pad = pad
		self.convex_finder = ConvexHullPocketFinder(pad)

		# Load binding pocket model
		self.base_dir = tempfile.mkdtemp()
		logger.info("About to download trained model.")
		# TODO(rbharath): Shift refined to full once trained.
		call((
		"wget -nv -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz"
		).split())
		call(("tar -zxvf pocket_random_refined_RF.tar.gz").split())
		call(("mv pocket_random_refined_RF %s" % (self.base_dir)).split())
		self.model_dir = os.path.join(self.base_dir, "pocket_random_refined_RF")
		Parameters
		----------
		protein_file: str
		Protein to load in.
		"""
		coords, _ = rdkit_util.load_molecule(protein_file)
		return box_utils.get_face_boxes(coords, self.pad)

		# Fit model on dataset
		self.model = SklearnModel(model_dir=self.model_dir)
		self.model.reload()
		def find_pockets(self, macromolecule_file):
		"""Find list of suitable binding pockets on protein.

		# Create featurizers
		self.pocket_featurizer = BindingPocketFeaturizer()
		self.ligand_featurizer = CircularFingerprint(size=1024)
		This function computes putative binding pockets on this protein.
		This class uses the `ConvexHull` to compute binding pockets. Each
		face of the hull is converted into a coordinate box used for
		binding.

		def find_pockets(self, protein_file, ligand_file):
		"""Compute features for a given complex
		Params
		------
		macromolecule_file: str
		Location of the macromolecule file to load

		TODO(rbharath): This has a lot of code overlap with
		compute_binding_pocket_features in
		examples/binding_pockets/binding_pocket_datasets.py. Find way to refactor
		to avoid code duplication.
		Returns
		-------
		List of pockets. Each pocket is a `CoordinateBox`
		"""
		# if not ligand_file.endswith(".sdf"):
		# raise ValueError("Only .sdf ligand files can be featurized.")
		# ligand_basename = os.path.basename(ligand_file).split(".")[0]
		# ligand_mol2 = os.path.join(
		# self.base_dir, ligand_basename + ".mol2")
		#
		# # Write mol2 file for ligand
		# obConversion = ob.OBConversion()
		# conv_out = obConversion.SetInAndOutFormats(str("sdf"), str("mol2"))
		# ob_mol = ob.OBMol()
		# obConversion.ReadFile(ob_mol, str(ligand_file))
		# obConversion.WriteFile(ob_mol, str(ligand_mol2))
		#
		# # Featurize ligand
		# mol = Chem.MolFromMol2File(str(ligand_mol2), removeHs=False)
		# if mol is None:
		# return None, None
		# # Default for CircularFingerprint
		# n_ligand_features = 1024
		# ligand_features = self.ligand_featurizer.featurize([mol])
		#
		# # Featurize pocket
		# pockets, pocket_atoms_map, pocket_coords = self.convex_finder.find_pockets(
		# protein_file, ligand_file)
		# n_pockets = len(pockets)
		# n_pocket_features = BindingPocketFeaturizer.n_features
		#
		# features = np.zeros((n_pockets, n_pocket_features+n_ligand_features))
		# pocket_features = self.pocket_featurizer.featurize(
		# protein_file, pockets, pocket_atoms_map, pocket_coords)
		# # Note broadcast operation
		# features[:, :n_pocket_features] = pocket_features
		# features[:, n_pocket_features:] = ligand_features
		# dataset = NumpyDataset(X=features)
		# pocket_preds = self.model.predict(dataset)
		# pocket_pred_proba = np.squeeze(self.model.predict_proba(dataset))
		#
		# # Find pockets which are active
		# active_pockets = []
		# active_pocket_atoms_map = {}
		# active_pocket_coords = []
		# for pocket_ind in range(len(pockets)):
		# #################################################### DEBUG
		# # TODO(rbharath): For now, using a weak cutoff. Fix later.
		# #if pocket_preds[pocket_ind] == 1:
		# if pocket_pred_proba[pocket_ind][1] > .15:
		# #################################################### DEBUG
		# pocket = pockets[pocket_ind]
		# active_pockets.append(pocket)
		# active_pocket_atoms_map[pocket] = pocket_atoms_map[pocket]
		# active_pocket_coords.append(pocket_coords[pocket_ind])
		# return active_pockets, active_pocket_atoms_map, active_pocket_coords
		# # TODO(LESWING)
		raise ValueError("Karl Implement")
		coords = rdkit_util.load_molecule(
		macromolecule_file, add_hydrogens=False, calc_charges=False)[0]
		boxes = box_utils.get_face_boxes(coords, self.pad)
		boxes = box_utils.merge_overlapping_boxes(boxes)
		return boxes

deepchem/dock/docking.py

+102 −100

Original line number	Diff line number	Diff line
		"""
		Docks protein-ligand pairs
		Docks Molecular Complexes
		"""
		__author__ = "Bharath Ramsundar"
		__copyright__ = "Copyright 2016, Stanford University"
		__license__ = "MIT"

		import logging
		import numpy as np
		import os
		import tempfile
		from deepchem.data import DiskDataset
		from deepchem.models import SklearnModel
		from deepchem.models import MultitaskRegressor
		from deepchem.dock.pose_scoring import GridPoseScorer
		from deepchem.dock.pose_generation import VinaPoseGenerator
		from sklearn.ensemble import RandomForestRegressor
		from subprocess import call
		from deepchem.data import NumpyDataset

		logger = logging.getLogger(__name__)


		class Docker(object):
		"""Abstract Class specifying API for Docking."""

		def dock(self,
		protein_file,
		ligand_file,
		centroid=None,
		box_dims=None,
		dry_run=False):
		raise NotImplementedError
		"""A generic molecular docking class

		This class provides a docking engine which uses provided models for
		featurization, pose generation, and scoring. Most pieces of docking
		software are command line tools that are invoked from the shell. The
		goal of this class is to provide a clean Python API for invoking
		molecular docking programmatically.

		class VinaGridRFDocker(Docker):
		"""Vina pose-generation, RF-models on grid-featurization of complexes."""
		The implementation of this class is lightweight and generic. It's
		expected that the majority of the heavy lifting will be done by pose
		generation and scoring classes that are provided to this class.
		"""

		def __init__(self, exhaustiveness=10, detect_pockets=False):
		"""Builds model."""
		def __init__(self, pose_generator, featurizer=None, scoring_model=None):
		"""Builds model.

		Parameters
		----------
		pose_generator: `PoseGenerator`
		The pose generator to use for this model
		featurizer: `ComplexFeaturizer`
		Featurizer associated with `scoring_model`
		scoring_model: `Model`
		Should make predictions on molecular complex.
		"""
		if ((featurizer is not None and scoring_model is None) or
		(featurizer is None and scoring_model is not None)):
		raise ValueError(
		"featurizer/scoring_model must both be set or must both be None.")
		self.base_dir = tempfile.mkdtemp()
		logger.info("About to download trained model.")
		call((
		"wget -nv -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/random_full_RF.tar.gz"
		).split())
		call(("tar -zxvf random_full_RF.tar.gz").split())
		call(("mv random_full_RF %s" % (self.base_dir)).split())
		self.model_dir = os.path.join(self.base_dir, "random_full_RF")

		# Fit model on dataset
		model = SklearnModel(model_dir=self.model_dir)
		model.reload()

		self.pose_scorer = GridPoseScorer(model, feat="grid")
		self.pose_generator = VinaPoseGenerator(
		exhaustiveness=exhaustiveness, detect_pockets=detect_pockets)
		self.pose_generator = pose_generator
		self.featurizer = featurizer
		self.scoring_model = scoring_model

		def dock(self,
		protein_file,
		ligand_file,
		molecular_complex,
		centroid=None,
		box_dims=None,
		dry_run=False):
		"""Docks using Vina and RF."""
		protein_docked, ligand_docked = self.pose_generator.generate_poses(
		protein_file, ligand_file, centroid, box_dims, dry_run)
		if not dry_run:
		score = self.pose_scorer.score(protein_docked, ligand_docked)
		exhaustiveness=10,
		num_modes=9,
		num_pockets=None,
		out_dir=None,
		use_pose_generator_scores=False):
		"""Generic docking function.

		This docking function uses this object's featurizer, pose
		generator, and scoring model to make docking predictions. This
		function is written in a generic style so that it works with whatever
		pose generator, featurizer, and scoring model this object was given.

		Parameters
		----------
		molecular_complex: Object
		Some representation of a molecular complex.
		exhaustiveness: int, optional (default 10)
		Tells pose generator how exhaustive it should be with pose
		generation.
		num_modes: int, optional (default 9)
		Tells pose generator how many binding modes it should generate at
		each invocation.
		num_pockets: int, optional (default None)
		If specified, `self.pocket_finder` must be set. Will only
		generate poses for the first `num_pockets` returned by
		`self.pocket_finder`.
		out_dir: str, optional (default None)
		If specified, write generated poses to this directory.
		use_pose_generator_scores: bool, optional (default False)
		If `True`, ask pose generator to generate scores. This cannot be
		`True` if `self.featurizer` and `self.scoring_model` are set
		since those will be used to generate scores in that case.

		Returns
		-------
		A generator. If `use_pose_generator_scores==True` or
		`self.scoring_model` is set, then will yield tuples
		`(posed_complex, score)`. Else will yield `posed_complex`.
		"""
		if self.scoring_model is not None and use_pose_generator_scores:
		raise ValueError(
		"Cannot set use_pose_generator_scores=True when self.scoring_model is set (since both generator scores for complexes)."
		)
		outputs = self.pose_generator.generate_poses(
		molecular_complex,
		centroid=centroid,
		box_dims=box_dims,
		exhaustiveness=exhaustiveness,
		num_modes=num_modes,
		num_pockets=num_pockets,
		out_dir=out_dir,
		generate_scores=use_pose_generator_scores)
		if use_pose_generator_scores:
		complexes, scores = outputs
		else:
		score = np.zeros((1,))
		return (score, (protein_docked, ligand_docked))


		'''
		class VinaGridDNNDocker(object):
		"""Vina pose-generation, DNN-models on grid-featurization of complexes."""

		def __init__(self, exhaustiveness=10, detect_pockets=False):
		"""Builds model."""
		self.base_dir = tempfile.mkdtemp()
		logger.info("About to download trained model.")
		call((
		"wget -nv -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/random_full_DNN.tar.gz"
		).split())
		call(("tar -zxvf random_full_DNN.tar.gz").split())
		call(("mv random_full_DNN %s" % (self.base_dir)).split())
		self.model_dir = os.path.join(self.base_dir, "random_full_DNN")

		# Fit model on dataset
		pdbbind_tasks = ["-logKd/Ki"]
		n_features = 2052
		model = MultitaskRegressor(
		len(pdbbind_tasks),
		n_features,
		dropouts=[.25],
		learning_rate=0.0003,
		weight_init_stddevs=[.1],
		batch_size=64,
		model_dir=self.model_dir)
		model.reload()

		self.pose_scorer = GridPoseScorer(model, feat="grid")
		self.pose_generator = VinaPoseGenerator(
		exhaustiveness=exhaustiveness, detect_pockets=detect_pockets)

		def dock(self,
		protein_file,
		ligand_file,
		centroid=None,
		box_dims=None,
		dry_run=False):
		"""Docks using Vina and DNNs."""
		protein_docked, ligand_docked = self.pose_generator.generate_poses(
		protein_file, ligand_file, centroid, box_dims, dry_run)
		if not dry_run:
		score = self.pose_scorer.score(protein_docked, ligand_docked)
		complexes = outputs
		# We know use_pose_generator_scores == False in this case
		if self.scoring_model is not None:
		for posed_complex in complexes:
		# TODO: How to handle the failure here?
		features, _ = self.featurizer.featurize_complexes([molecular_complex])
		dataset = NumpyDataset(X=features)
		score = self.scoring_model.predict(dataset)
		yield (posed_complex, score)
		elif use_pose_generator_scores:
		for posed_complex, score in zip(complexes, scores):
		yield (posed_complex, score)
		else:
		score = np.zeros((1,))
		return (score, (protein_docked, ligand_docked))
		'''
		for posed_complex in complexes:
		yield posed_complex

Admin message