Commit 7cafea66, authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #495 from miaecle/DAG2

Directed acyclic graph models
parents d82fd707 5544a2e8
Loading
Loading
Loading
Loading
+5 −6
Original line number Diff line number Diff line
@@ -499,9 +499,9 @@ class DiskDataset(Dataset):
      w_next = np.zeros((0,) + (len(tasks),))
      ids_next = np.zeros((0,), dtype=object)
      for (X, y, w, ids) in self.itershards():
        X_next = np.vstack([X_next, X])
        y_next = np.vstack([y_next, y])
        w_next = np.vstack([w_next, w])
        X_next = np.concatenate([X_next, X], axis=0)
        y_next = np.concatenate([y_next, y], axis=0)
        w_next = np.concatenate([w_next, w], axis=0)
        ids_next = np.concatenate([ids_next, ids])
        while len(X_next) > shard_size:
          X_batch, X_next = X_next[:shard_size], X_next[shard_size:]
@@ -526,9 +526,8 @@ class DiskDataset(Dataset):
    if not len(self.metadata_df):
      raise ValueError("No data in dataset.")
    sample_X = load_from_disk(
        os.path.join(self.data_dir, next(self.metadata_df.iterrows())[1]['X']))[
            0]
    return np.shape(sample_X)
        os.path.join(self.data_dir, next(self.metadata_df.iterrows())[1]['X']))
    return np.shape(sample_X)[1:]

  def get_shard_size(self):
    """Gets size of shards on disk."""
+46 −0
Original line number Diff line number Diff line
@@ -706,6 +706,52 @@ class TestOverfit(test_util.TensorFlowTestCase):

    assert scores[regression_metric.name] > .9

  def test_DAG_singletask_regression_overfit(self):
    """Check that the DAG regressor can overfit a tiny singletask dataset."""
    # Fixed seeds keep the overfit check deterministic across runs.
    np.random.seed(123)
    tf.set_random_seed(123)
    n_tasks = 1

    # Featurize the mini log-solubility dataset with graph convolutions.
    featurizer = dc.feat.ConvMolFeaturizer()
    tasks = ["outcome"]
    csv_path = os.path.join(self.current_dir, "example_regression.csv")
    csv_loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = csv_loader.featurize(csv_path)

    regression_metric = dc.metrics.Metric(
        dc.metrics.pearson_r2_score, task_averager=np.mean)

    n_feat = 75
    batch_size = 10
    # Convert the ConvMol features into the DAG representation.
    dataset = dc.trans.DAGTransformer(max_atoms=50).transform(dataset)

    graph = dc.nn.SequentialDAGGraph(n_feat, batch_size=batch_size,
                                     max_atoms=50)
    graph.add(dc.nn.DAGLayer(30, n_feat, max_atoms=50))
    graph.add(dc.nn.DAGGather(max_atoms=50))

    model = dc.models.MultitaskGraphRegressor(
        graph,
        n_tasks,
        n_feat,
        batch_size=batch_size,
        learning_rate=0.005,
        learning_rate_decay_time=1000,
        optimizer_type="adam",
        beta1=.9,
        beta2=.999)

    # Train long enough to memorize the tiny dataset, then score on train.
    model.fit(dataset, nb_epoch=50)
    model.save()
    scores = model.evaluate(dataset, [regression_metric])

    assert scores[regression_metric.name] > .8

  def test_siamese_singletask_classification_overfit(self):
    """Test siamese singletask model overfits tiny data."""
    np.random.seed(123)
+2 −0
Original line number Diff line number Diff line
@@ -13,6 +13,8 @@ class DTNNGraphRegressor(MultitaskGraphRegressor):

    feat = self.model.return_outputs()
    feat_size = self.feat_dim
    # dimension of `feat` becomes Unknown after tf.tensordot operation
    # need to define dimension of W and b explicitly
    outputs = []
    W_list = []
    b_list = []
+34 −1
Original line number Diff line number Diff line
@@ -11,7 +11,7 @@ __license__ = "MIT"

import tensorflow as tf
from deepchem.nn.layers import GraphGather
from deepchem.models.tf_new_models.graph_topology import GraphTopology, DTNNGraphTopology
from deepchem.models.tf_new_models.graph_topology import GraphTopology, DTNNGraphTopology, DAGGraphTopology


class SequentialGraph(object):
@@ -129,6 +129,39 @@ class SequentialDTNNGraph(SequentialGraph):
      self.layers.append(layer)


class SequentialDAGGraph(SequentialGraph):
  """Sequential container for DAG-style graph models.

  Wires a DAGGraphTopology into its own tf.Graph and stacks layers on
  top of the atom-feature placeholder, mirroring the Keras Sequential API.
  """

  def __init__(self, n_feat, batch_size=50, max_atoms=50):
    """
    Parameters
    ----------
    n_feat: int
      Number of features per atom.
    batch_size: int, optional(default=50)
      Number of molecules in a batch.
    max_atoms: int, optional(default=50)
      Maximum number of atoms in a molecule; choose based on the dataset.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
      self.graph_topology = DAGGraphTopology(
          n_feat, batch_size, max_atoms=max_atoms)
      self.output = self.graph_topology.get_atom_features_placeholder()
    self.layers = []

  def add(self, layer):
    """Appends a layer, feeding it the running output tensor."""
    with self.graph.as_default():
      # DAGLayer additionally consumes the topology placeholders.
      if type(layer).__name__ == 'DAGLayer':
        layer_inputs = [self.output]
        layer_inputs += self.graph_topology.get_topology_placeholders()
        self.output = layer(layer_inputs)
      else:
        self.output = layer(self.output)
      self.layers.append(layer)


class SequentialSupportGraph(object):
  """An analog of Keras Sequential model for test/support models."""

+134 −0
Original line number Diff line number Diff line
@@ -258,3 +258,137 @@ class DTNNGraphTopology(GraphTopology):
    steps = np.array([distance_min + i * step_size for i in range(n_distance)])
    distance_vector = np.exp(-np.square(distance - steps) / (2 * step_size**2))
    return distance_vector


class DAGGraphTopology(GraphTopology):
  """GraphTopology for DAG models.

  Holds the placeholders consumed by DAG layers: per-atom features, the
  parent matrices of each atom's DAG, the order in which atoms must be
  evaluated, and a membership mask marking real (vs. padded) atoms.
  Every molecule is padded to `max_atoms` atoms so all placeholders have
  static shapes of `batch_size * max_atoms` rows.
  """

  def __init__(self, n_feat, batch_size, name='topology', max_atoms=50):
    """
    Parameters
    ----------
    n_feat: int
      Number of features per atom.
    batch_size: int
      Number of molecules in a batch.
    name: str, optional(default='topology')
      Prefix used for all placeholder names.
    max_atoms: int, optional(default=50)
      Maximum number of atoms in a molecule, should be defined based on
      the dataset.
    """
    self.n_feat = n_feat
    self.name = name
    self.max_atoms = max_atoms
    self.batch_size = batch_size
    self.atom_features_placeholder = tf.placeholder(
        dtype='float32',
        shape=(self.batch_size * self.max_atoms, self.n_feat),
        name=self.name + '_atom_features')

    self.parents_placeholder = tf.placeholder(
        dtype='int32',
        shape=(self.batch_size * self.max_atoms, self.max_atoms,
               self.max_atoms),
        # molecule * atom(graph) => step => features
        name=self.name + '_parents')

    self.calculation_orders_placeholder = tf.placeholder(
        dtype='int32',
        shape=(self.batch_size * self.max_atoms, self.max_atoms),
        # molecule * atom(graph) => step
        name=self.name + '_orders')

    self.membership_placeholder = tf.placeholder(
        dtype='int32',
        shape=(self.batch_size * self.max_atoms),
        name=self.name + '_membership')

    # Define the list of tensors to be used as topology
    self.topology = [
        self.parents_placeholder, self.calculation_orders_placeholder,
        self.membership_placeholder
    ]

    self.inputs = [self.atom_features_placeholder]
    self.inputs += self.topology

  def get_parents_placeholder(self):
    """Returns the int32 placeholder holding the DAG parent matrices."""
    return self.parents_placeholder

  def get_calculation_orders_placeholder(self):
    """Returns the int32 placeholder holding atom evaluation orders."""
    return self.calculation_orders_placeholder

  def batch_to_feed_dict(self, batch):
    """Converts the current batch of mol_graphs into tensorflow feed_dict.

    Assigns the graph information in array of ConvMol objects to the
    placeholders tensors for DAG models.

    params
    ------
    batch : np.ndarray
      Array of ConvMol objects

    returns
    -------
    feed_dict : dict
      Can be merged with other feed_dicts for input into tensorflow
    """
    atoms_per_mol = [mol.get_num_atoms() for mol in batch]
    n_atom_features = batch[0].get_atom_features().shape[1]
    # 1 marks a real atom, 0 marks a padding slot, per molecule.
    # NOTE(review): assumes every molecule has at most max_atoms atoms;
    # the repeat count goes negative otherwise — confirm upstream filtering.
    membership = np.concatenate(
        [
            np.array([1] * n_atoms + [0] * (self.max_atoms - n_atoms))
            for n_atoms in atoms_per_mol
        ],
        axis=0)

    atoms_all = []
    # calculation orders for a batch of molecules
    parents_all = []
    calculation_orders = []
    for idm, mol in enumerate(batch):
      # padding atom features vector of each molecule with 0
      atom_features_padded = np.concatenate(
          [
              mol.get_atom_features(), np.zeros(
                  (self.max_atoms - atoms_per_mol[idm], n_atom_features))
          ],
          axis=0)
      atoms_all.append(atom_features_padded)

      # calculation orders for DAGs
      parents = mol.parents
      # number of DAGs should equal number of atoms
      assert len(parents) == atoms_per_mol[idm]
      parents_all.extend(parents[:])
      # padding with `max_atoms`
      parents_all.extend([
          self.max_atoms * np.ones((self.max_atoms, self.max_atoms), dtype=int)
          for i in range(self.max_atoms - atoms_per_mol[idm])
      ])
      for parent in parents:
        # index for an atom in `parents_all` and `atoms_all` is different,
        # this function changes the index from the position in current
        # molecule (DAGs, `parents_all`) to the position in the batch of
        # molecules (`atoms_all`); only used in tf.gather on
        # `atom_features_placeholder`
        calculation_orders.append(self.index_changing(parent[:, 0], idm))

      # padding with `batch_size*max_atoms`
      calculation_orders.extend([
          self.batch_size * self.max_atoms * np.ones(
              (self.max_atoms,), dtype=int)
          for i in range(self.max_atoms - atoms_per_mol[idm])
      ])

    atoms_all = np.concatenate(atoms_all, axis=0)
    parents_all = np.stack(parents_all, axis=0)
    calculation_orders = np.stack(calculation_orders, axis=0)
    atoms_dict = {
        self.atom_features_placeholder: atoms_all,
        self.membership_placeholder: membership,
        self.parents_placeholder: parents_all,
        self.calculation_orders_placeholder: calculation_orders
    }

    return atoms_dict

  def index_changing(self, index, n_mol):
    """Map within-molecule atom indices to batch-wide row indices.

    Entries referring to real atoms (< max_atoms) are offset by the
    molecule's position in the batch; padding entries (>= max_atoms) are
    redirected to the shared padding row at `batch_size * max_atoms`.

    params
    ------
    index : np.ndarray
      Integer atom indices local to molecule `n_mol`.
    n_mol : int
      Position of the molecule within the batch.

    returns
    -------
    np.ndarray
      Indices into the flattened `atom_features_placeholder`.
    """
    index = np.asarray(index)
    # Vectorized replacement of the original per-element loop.
    output = np.where(index < self.max_atoms,
                      index + n_mol * self.max_atoms,
                      self.batch_size * self.max_atoms)
    # Preserve the caller-supplied integer dtype, as np.zeros_like did.
    return output.astype(index.dtype, copy=False)
Loading