Merge branch 'master' of https://github.com/deepchem/deepchem into nci_ref (767e32f2) · Commits · 钟慕尧 / deepchem

README.md

+9 −5

Original line number	Diff line number	Diff line
		@@ -251,9 +251,12 @@ Scaffold splitting
		\| \|graphconv regression\|Random \|0.996 \|0.873 \|
		\| \|MT-NN regression \|Scaffold \|0.782 \|0.426 \|
		\| \|graphconv regression\|Scaffold \|0.994 \|0.606 \|
		\|nci \|MT-NN regression \|Index \|0.890 \|0.890 \|
		\| \|MT-NN regression \|Random \|0.891 \|0.888 \|
		\| \|MT-NN regression \|Scaffold \|0.912 \|0.020 \|
		\|nci \|MT-NN regression \|Index \|0.171 \|0.062 \|
		\| \|graphconv regression\|Index \|0.123 \|0.048 \|
		\| \|MT-NN regression \|Random \|0.168 \|0.085 \|
		\| \|graphconv regression\|Random \|0.117 \|0.076 \|
		\| \|MT-NN regression \|Scaffold \|0.180 \|0.052 \|
		\| \|graphconv regression\|Scaffold \|0.131 \|0.046 \|
		\|kaggle \|MT-NN regression \|User-defined\|0.748 \|0.452 \|

		* General features
		@@ -269,7 +272,7 @@ Number of tasks and examples in the datasets
		\|toxcast \|617 \|8615 \|
		\|delaney \|1 \|1128 \|
		\|kaggle \|15 \|173065 \|
		\|nci \|60 \|1057371 \|
		\|nci \|60 \|19127 \|

		Time needed for benchmark test(~20h in total)

		@@ -297,7 +300,8 @@ Time needed for benchmark test(~20h in total)
		\| \|graph convolution \|80 \|900 \|
		\|delaney \|MT-NN regression \|10 \|40 \|
		\| \|graphconv regression\|10 \|40 \|
		\|nci \|MT-NN regression \|2000 \|30000 \|
		\|nci \|MT-NN regression \|400 \|1200 \|
		\| \|graphconv regression\|400 \|2500 \|
		\|kaggle \|MT-NN regression \|2200 \|3200 \|

deepchem/data/datasets.py

+9 −4

Original line number	Diff line number	Diff line
		@@ -225,13 +225,18 @@ class Dataset(object):
		class NumpyDataset(Dataset):
		"""A Dataset defined by in-memory numpy arrays."""

		def __init__(self, X, y, w=None, ids=None):
		def __init__(self, X, y=None, w=None, ids=None):
		n_samples = len(X)
		# The -1 indicates that y will be reshaped to have length -1
		if n_samples > 0:
		if y is not None:
		y = np.reshape(y, (n_samples, -1))
		if w is not None:
		w = np.reshape(w, (n_samples, -1))
		else:
		# Set labels to be zero, with zero weights
		y = np.zeros((n_samples, 1))
		w = np.zeros_like(y)
		n_tasks = y.shape[1]
		if ids is None:
		ids = np.arange(n_samples)

deepchem/dock/init.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -13,3 +13,4 @@ from deepchem.dock.docking import Docker
		from deepchem.dock.docking import VinaGridRFDocker
		from deepchem.dock.docking import VinaGridDNNDocker
		from deepchem.dock.binding_pocket import ConvexHullPocketFinder
		from deepchem.dock.binding_pocket import RFConvexHullPocketFinder

deepchem/dock/binding_pocket.py

+108 −11

Original line number	Diff line number	Diff line
		@@ -9,20 +9,25 @@ __author__ = "Bharath Ramsundar"
		__copyright__ = "Copyright 2017, Stanford University"
		__license__ = "GPL"

		import numpy as np
		import os
		import pybel
		import tempfile
		import numpy as np
		import openbabel as ob
		from rdkit import Chem
		from subprocess import call
		from scipy.spatial import ConvexHull
		from deepchem.feat import hydrogenate_and_compute_partial_charges
		from deepchem.feat.atomic_coordinates import AtomicCoordinates
		from deepchem.feat.grid_featurizer import load_molecule
		from subprocess import call
		from deepchem.feat.binding_pocket_features import BindingPocketFeaturizer
		from deepchem.feat.fingerprints import CircularFingerprint
		from deepchem.models.sklearn_models import SklearnModel
		from deepchem.data.datasets import NumpyDataset

		def extract_active_site(protein_file, ligand_file, cutoff=4):
		"""Extracts a box for the active site."""
		protein_coords = load_molecule(protein_file)[0]
		ligand_coords = load_molecule(ligand_file)[0]
		protein_coords = load_molecule(protein_file, add_hydrogens=False)[0]
		ligand_coords = load_molecule(ligand_file, add_hydrogens=False)[0]
		num_ligand_atoms = len(ligand_coords)
		num_protein_atoms = len(protein_coords)
		pocket_inds = []
		@@ -164,7 +169,7 @@ def merge_overlapping_boxes(mapping, boxes, threshold=.8):
		class BindingPocketFinder(object):
		"""Abstract superclass for binding pocket detectors"""

		def find_pockets(self, protein_file):
		def find_pockets(self, protein_file, ligand_file):
		"""Finds potential binding pockets in proteins."""
		raise NotImplementedError

		@@ -179,14 +184,106 @@ class ConvexHullPocketFinder(BindingPocketFinder):
		def find_all_pockets(self, protein_file):
		"""Find list of binding pockets on protein."""
		# protein_coords is (N, 3) tensor
		coords = load_molecule(protein_file)[0]
		coords = load_molecule(protein_file, add_hydrogens=False)[0]
		return get_all_boxes(coords, self.pad)

		def find_pockets(self, protein_file, ligand_file):
		"""Find list of suitable binding pockets on protein."""
		protein_coords = load_molecule(protein_file)[0]
		ligand_coords = load_molecule(ligand_file)[0]
		protein_coords = load_molecule(protein_file, add_hydrogens=False)[0]
		ligand_coords = load_molecule(ligand_file, add_hydrogens=False)[0]
		boxes = get_all_boxes(protein_coords, self.pad)
		mapping = boxes_to_atoms(protein_coords, boxes)
		merged_boxes, mapping = merge_overlapping_boxes(mapping, boxes)
		return merged_boxes, mapping
		pockets, pocket_atoms_map = merge_overlapping_boxes(mapping, boxes)
		pocket_coords = []
		for pocket in pockets:
		atoms = pocket_atoms_map[pocket]
		coords = np.zeros((len(atoms), 3))
		for ind, atom in enumerate(atoms):
		coords[ind] = protein_coords[atom]
		pocket_coords.append(coords)
		return pockets, pocket_atoms_map, pocket_coords

		class RFConvexHullPocketFinder(BindingPocketFinder):
		"""Uses pre-trained RF model + ConvexHulPocketFinder to select pockets."""

		def __init__(self, pad=5):
		self.pad = pad
		self.convex_finder = ConvexHullPocketFinder(pad)

		# Load binding pocket model
		self.base_dir = tempfile.mkdtemp()
		print("About to download trained model.")
		# TODO(rbharath): Shift refined to full once trained.
		call(("wget -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz").split())
		call(("tar -zxvf pocket_random_refined_RF.tar.gz").split())
		call(("mv pocket_random_refined_RF %s" % (self.base_dir)).split())
		self.model_dir = os.path.join(self.base_dir, "pocket_random_refined_RF")

		# Fit model on dataset
		self.model = SklearnModel(model_dir=self.model_dir)
		self.model.reload()

		# Create featurizers
		self.pocket_featurizer = BindingPocketFeaturizer()
		self.ligand_featurizer = CircularFingerprint(size=1024)

		def find_pockets(self, protein_file, ligand_file):
		"""Compute features for a given complex

		TODO(rbharath): This has a log of code overlap with
		compute_binding_pocket_features in
		examples/binding_pockets/binding_pocket_datasets.py. Find way to refactor
		to avoid code duplication.
		"""
		if not ligand_file.endswith(".sdf"):
		raise ValueError("Only .sdf ligand files can be featurized.")
		ligand_basename = os.path.basename(ligand_file).split(".")[0]
		ligand_mol2 = os.path.join(
		self.base_dir, ligand_basename + ".mol2")

		# Write mol2 file for ligand
		obConversion = ob.OBConversion()
		conv_out = obConversion.SetInAndOutFormats(str("sdf"), str("mol2"))
		ob_mol = ob.OBMol()
		obConversion.ReadFile(ob_mol, str(ligand_file))
		obConversion.WriteFile(ob_mol, str(ligand_mol2))

		# Featurize ligand
		mol = Chem.MolFromMol2File(str(ligand_mol2), removeHs=False)
		if mol is None:
		return None, None
		# Default for CircularFingerprint
		n_ligand_features = 1024
		ligand_features = self.ligand_featurizer.featurize([mol])

		# Featurize pocket
		pockets, pocket_atoms_map, pocket_coords = self.convex_finder.find_pockets(
		protein_file, ligand_file)
		n_pockets = len(pockets)
		n_pocket_features = BindingPocketFeaturizer.n_features

		features = np.zeros((n_pockets, n_pocket_features+n_ligand_features))
		pocket_features = self.pocket_featurizer.featurize(
		protein_file, pockets, pocket_atoms_map, pocket_coords)
		# Note broadcast operation
		features[:, :n_pocket_features] = pocket_features
		features[:, n_pocket_features:] = ligand_features
		dataset = NumpyDataset(X=features)
		pocket_preds = self.model.predict(dataset)
		pocket_pred_proba = np.squeeze(self.model.predict_proba(dataset))

		# Find pockets which are active
		active_pockets = []
		active_pocket_atoms_map = {}
		active_pocket_coords = []
		for pocket_ind in range(len(pockets)):
		#################################################### DEBUG
		# TODO(rbharath): For now, using a weak cutoff. Fix later.
		#if pocket_preds[pocket_ind] == 1:
		if pocket_pred_proba[pocket_ind][1] > .15:
		#################################################### DEBUG
		pocket = pockets[pocket_ind]
		active_pockets.append(pocket)
		active_pocket_atoms_map[pocket] = pocket_atoms_map[pocket]
		active_pocket_coords.append(pocket_coords[pocket_ind])
		return active_pockets, active_pocket_atoms_map, active_pocket_coords

deepchem/dock/docking.py

+6 −4

Original line number	Diff line number	Diff line
		@@ -30,7 +30,7 @@ class Docker(object):
		class VinaGridRFDocker(Docker):
		"""Vina pose-generation, RF-models on grid-featurization of complexes."""

		def __init__(self):
		def __init__(self, exhaustiveness=10, detect_pockets=False):
		"""Builds model."""
		self.base_dir = tempfile.mkdtemp()
		print("About to download trained model.")
		@@ -44,7 +44,8 @@ class VinaGridRFDocker(Docker):
		model.reload()

		self.pose_scorer = GridPoseScorer(model, feat="grid")
		self.pose_generator = VinaPoseGenerator()
		self.pose_generator = VinaPoseGenerator(
		exhaustiveness=exhaustiveness, detect_pockets=detect_pockets)

		def dock(self, protein_file, ligand_file):
		"""Docks using Vina and RF."""
		@@ -56,7 +57,7 @@ class VinaGridRFDocker(Docker):
		class VinaGridDNNDocker(object):
		"""Vina pose-generation, DNN-models on grid-featurization of complexes."""

		def __init__(self, n_trees=100):
		def __init__(self, exhaustiveness=10, detect_pockets=False):
		"""Builds model."""
		self.base_dir = tempfile.mkdtemp()
		print("About to download trained model.")
		@@ -74,7 +75,8 @@ class VinaGridDNNDocker(object):
		model.reload()

		self.pose_scorer = GridPoseScorer(model, feat="grid")
		self.pose_generator = VinaPoseGenerator()
		self.pose_generator = VinaPoseGenerator(
		exhaustiveness=exhaustiveness, detect_pockets=detect_pockets)

		def dock(self, protein_file, ligand_file):
		"""Docks using Vina and DNNs."""

Original line number	Diff line number	Diff line
		@@ -251,9 +251,12 @@ Scaffold splitting
		\| \|graphconv regression\|Random \|0.996 \|0.873 \|
		\| \|MT-NN regression \|Scaffold \|0.782 \|0.426 \|
		\| \|graphconv regression\|Scaffold \|0.994 \|0.606 \|
		\|nci \|MT-NN regression \|Index \|0.890 \|0.890 \|
		\| \|MT-NN regression \|Random \|0.891 \|0.888 \|
		\| \|MT-NN regression \|Scaffold \|0.912 \|0.020 \|
		\|nci \|MT-NN regression \|Index \|0.171 \|0.062 \|
		\| \|graphconv regression\|Index \|0.123 \|0.048 \|
		\| \|MT-NN regression \|Random \|0.168 \|0.085 \|
		\| \|graphconv regression\|Random \|0.117 \|0.076 \|
		\| \|MT-NN regression \|Scaffold \|0.180 \|0.052 \|
		\| \|graphconv regression\|Scaffold \|0.131 \|0.046 \|
		\|kaggle \|MT-NN regression \|User-defined\|0.748 \|0.452 \|

		* General features
		@@ -269,7 +272,7 @@ Number of tasks and examples in the datasets
		\|toxcast \|617 \|8615 \|
		\|delaney \|1 \|1128 \|
		\|kaggle \|15 \|173065 \|
		\|nci \|60 \|1057371 \|
		\|nci \|60 \|19127 \|

		Time needed for benchmark test(~20h in total)

		@@ -297,7 +300,8 @@ Time needed for benchmark test(~20h in total)
		\| \|graph convolution \|80 \|900 \|
		\|delaney \|MT-NN regression \|10 \|40 \|
		\| \|graphconv regression\|10 \|40 \|
		\|nci \|MT-NN regression \|2000 \|30000 \|
		\|nci \|MT-NN regression \|400 \|1200 \|
		\| \|graphconv regression\|400 \|2500 \|
		\|kaggle \|MT-NN regression \|2200 \|3200 \|

Admin message