Merge pull request #1411 from rbharath/ch5_changes (9f7fa857) · Commits · 钟慕尧 / deepchem

deepchem/feat/rdkit_grid_featurizer.py

+22 −0

Original line number	Diff line number	Diff line
		@@ -1313,6 +1313,28 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
		channel_power=None,
		nb_channel=16,
		dtype="np.int8"):
		"""Private helper function to voxelize inputs.

		Parameters
		----------
		get_voxels: function
		Function that voxelizes inputs
		hash_function: function
		Used to map feature choices to voxel channels.
		coordinates: np.ndarray
		Contains the 3D coordinates of a molecular system.
		feature_dict: Dictionary
		Keys are atom indices.
		feature_list: list
		List of available features.
		channel_power: int
		If specified, nb_channel is set to 2**channel_power.
		TODO: This feels like a redundant parameter.
		nb_channel: int
		The number of feature channels computed per voxel
		dtype: type
		The dtype of the numpy ndarray created to hold features.
		"""

		if channel_power is not None:
		if channel_power == 0:

deepchem/models/__init__.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -27,6 +27,7 @@ from deepchem.models.tensorgraph.models.text_cnn import TextCNNModel
		from deepchem.models.tensorgraph.sequential import Sequential
		from deepchem.models.tensorgraph.models.sequence_dnn import SequenceDNN
		from deepchem.models.tensorgraph.models.ontology import OntologyModel, OntologyNode, create_gene_ontology
		from deepchem.models.tensorgraph.models.atomic_conv import AtomicConvModel

		#################### Compatibility imports for renamed TensorGraph models. Remove below with DeepChem 3.0. ####################

deepchem/models/tensorgraph/tests/test_atomic_conv.py

+75 −0

Original line number	Diff line number	Diff line
		@@ -6,6 +6,7 @@ from __future__ import unicode_literals

		from nose.plugins.attrib import attr

		import os
		import deepchem
		import numpy as np
		import tensorflow as tf
		@@ -14,6 +15,7 @@ import numpy as np
		from deepchem.models.tensorgraph.models import atomic_conv
		from deepchem.models.tensorgraph import layers
		from deepchem.data import NumpyDataset
		from deepchem.feat.atomic_coordinates import ComplexNeighborListFragmentAtomicCoordinates


		class TestAtomicConv(unittest.TestCase):
		@@ -63,3 +65,76 @@ class TestAtomicConv(unittest.TestCase):
		labels = np.zeros(batch_size)
		train = NumpyDataset(features, labels)
		atomic_convnet.fit(train, nb_epoch=1)

		@attr("slow")
		def test_atomic_conv_variable(self):
		    """Fit an AtomicConvModel on a complex whose fragments differ in size.

		    A single dummy complex is built from a 1000-atom fragment and a
		    1200-atom fragment (2200 atoms combined). Every atom gets random
		    coordinates, an empty neighbor list and a random integer in [0, 10)
		    as its per-atom value, and the model is fit for one epoch.
		    """
		    frag1_num_atoms = 1000
		    frag2_num_atoms = 1200
		    complex_num_atoms = frag1_num_atoms + frag2_num_atoms
		    batch_size = 1

		    # Build the model before drawing any random features.
		    atomic_convnet = atomic_conv.AtomicConvModel(
		        batch_size=batch_size,
		        frag1_num_atoms=frag1_num_atoms,
		        frag2_num_atoms=frag2_num_atoms,
		        complex_num_atoms=complex_num_atoms)

		    def dummy_entry(num_atoms):
		        # One (coordinates, neighbor list, per-atom ints) triple for a
		        # system of ``num_atoms`` atoms; no atom has any neighbors.
		        coords = np.random.rand(num_atoms, 3)
		        nbr_list = {atom: [] for atom in range(num_atoms)}
		        z = np.random.randint(10, size=num_atoms)
		        return coords, nbr_list, z

		    # The model consumes one 9-tuple per complex, ordered as
		    # fragment 1, fragment 2, then the whole system.
		    sample = (dummy_entry(frag1_num_atoms) + dummy_entry(frag2_num_atoms) +
		              dummy_entry(complex_num_atoms))
		    features = np.asarray([sample])
		    labels = np.zeros(batch_size)

		    train = NumpyDataset(features, labels)
		    atomic_convnet.fit(train, nb_epoch=1)

		@attr("slow")
		def test_atomic_conv_with_feat(self):
		    """Featurize the 3zso ligand/protein pair and fit an AtomicConvModel.

		    The two PDB files are located relative to this test module, run
		    through ComplexNeighborListFragmentAtomicCoordinates, wrapped in a
		    DiskDataset with a single arbitrary label, and used for one call to
		    ``fit``.
		    """
		    here = os.path.dirname(os.path.realpath(__file__))
		    ligand_file = os.path.join(
		        here, "../../../feat/tests/data/3zso_ligand_hyd.pdb")
		    protein_file = os.path.join(
		        here, "../../../feat/tests/data/3zso_protein.pdb")

		    # Atom counts pulled from the PDB files above (ligand, protein,
		    # total). For larger datasets with more PDBs, the maximum atom
		    # counts would be used instead of exact ones.
		    frag1_num_atoms, frag2_num_atoms, complex_num_atoms = 44, 2336, 2380
		    max_num_neighbors = 4
		    neighbor_cutoff = 4  # in angstroms

		    complex_featurizer = ComplexNeighborListFragmentAtomicCoordinates(
		        frag1_num_atoms, frag2_num_atoms, complex_num_atoms,
		        max_num_neighbors, neighbor_cutoff)

		    # The label value is arbitrary; only one complex is featurized.
		    labels = np.array([0])
		    features, _ = complex_featurizer.featurize_complexes(
		        [ligand_file], [protein_file])
		    dataset = deepchem.data.DiskDataset.from_numpy(features, labels)

		    model_kwargs = dict(
		        batch_size=1,
		        frag1_num_atoms=frag1_num_atoms,
		        frag2_num_atoms=frag2_num_atoms,
		        complex_num_atoms=complex_num_atoms)
		    print("Constructing Atomic Conv model")
		    atomic_convnet = atomic_conv.AtomicConvModel(**model_kwargs)

		    print("About to call fit")
		    atomic_convnet.fit(dataset)

deepchem/molnet/load_function/pdbbind_datasets.py

+6 −8

Original line number	Diff line number	Diff line
		@@ -209,21 +209,19 @@ def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
		labels = np.array(labels)
		# Featurize Data
		if featurizer == "grid":
		# TODO: This is not the correct setting. Set hyperparameters correctly
		ecfp_power = 5
		splif_power = 5
		featurizer = rgf.RdkitGridFeaturizer(
		voxel_width=16.0,
		feature_types=['ecfp', 'splif', 'hbond', 'salt_bridge'],
		ecfp_power=ecfp_power,
		splif_power=splif_power,
		voxel_width=2.0,
		feature_types=[
		'ecfp', 'splif', 'hbond', 'salt_bridge', 'pi_stack', 'cation_pi',
		'charge'
		],
		flatten=True)
		elif featurizer == "atomic":
		# Pulled from PDB files. For larger datasets with more PDBs, would use
		# max num atoms instead of exact.
		frag1_num_atoms = 70 # for ligand atoms
		frag2_num_atoms = 24000 # for protein atoms
		complex_num_atoms = 24060 # in total
		complex_num_atoms = 24070 # in total
		max_num_neighbors = 4
		# Cutoff in angstroms
		neighbor_cutoff = 4

examples/pdbbind/pdbbind_atomic_conv.py

0 → 100644

+49 −0

Original line number	Diff line number	Diff line
		"""
		Script that trains Atomic Conv models on PDBbind dataset.
		"""
		from __future__ import print_function
		from __future__ import division
		from __future__ import unicode_literals

		__author__ = "Bharath Ramsundar"
		__copyright__ = "Copyright 2016, Stanford University"
		__license__ = "MIT"

		import os
		import deepchem as dc
		import numpy as np
		from sklearn.ensemble import RandomForestRegressor
		from deepchem.molnet import load_pdbbind

		# Seed NumPy's global RNG so repeated runs are reproducible.
		np.random.seed(123)

		# Load the PDBbind "core" subset with the atomic (coordinates plus
		# neighbor-list) featurization and a random train/valid/test split.
		pdbbind_tasks, pdbbind_datasets, transformers = load_pdbbind(
		    featurizer="atomic", split="random", subset="core")
		train_dataset, valid_dataset, test_dataset = pdbbind_datasets

		metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

		# Atom counts handed to the model. NOTE(review): these presumably have
		# to agree with the sizes the "atomic" featurizer in load_pdbbind pads
		# to -- confirm against the loader before changing them.
		frag1_num_atoms = 70  # for ligand atoms
		frag2_num_atoms = 24000  # for protein atoms
		complex_num_atoms = frag1_num_atoms + frag2_num_atoms
		# NOTE(review): the script previously used ``batch_size`` without ever
		# defining it; 24 is a chosen value -- tune it to the available memory.
		batch_size = 24

		# The model is reached through the ``dc`` namespace (the script never
		# imported ``atomic_conv``) and bound to ``model``, the name every
		# statement below uses.
		model = dc.models.AtomicConvModel(
		    batch_size=batch_size,
		    frag1_num_atoms=frag1_num_atoms,
		    frag2_num_atoms=frag2_num_atoms,
		    complex_num_atoms=complex_num_atoms)

		# Train on the training split and persist the fitted model.
		print("Fitting model on train dataset")
		model.fit(train_dataset)
		model.save()

		print("Evaluating model")
		train_scores = model.evaluate(train_dataset, [metric], transformers)
		valid_scores = model.evaluate(valid_dataset, [metric], transformers)

		print("Train scores")
		print(train_scores)

		print("Validation scores")
		print(valid_scores)

Admin message