Merge pull request #1182 from lilleswing/sascore (a1410118) · Commits · 钟慕尧 / deepchem

deepchem/models/__init__.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -19,6 +19,7 @@ from deepchem.models.tensorgraph.robust_multitask import RobustMultitaskRegresso
		from deepchem.models.tensorgraph.progressive_multitask import ProgressiveMultitaskRegressor, ProgressiveMultitaskClassifier
		from deepchem.models.tensorgraph.models.graph_models import WeaveModel, DTNNModel, DAGModel, GraphConvModel, MPNNModel
		from deepchem.models.tensorgraph.models.symmetry_function_regression import BPSymmetryFunctionRegression, ANIRegression
		from deepchem.models.tensorgraph.models.scscore import ScScoreModel

		from deepchem.models.tensorgraph.models.seqtoseq import SeqToSeq
		from deepchem.models.tensorgraph.models.gan import GAN, WGAN

deepchem/models/tensorgraph/layers.py

+2 −2

Original line number	Diff line number	Diff line
		@@ -4638,7 +4638,7 @@ class GraphCNN(Layer):
		return result


		class Hingeloss(Layer):
		class HingeLoss(Layer):
		"""This layer computes the hinge loss on inputs:[labels,logits]
		labels: The values of this tensor is expected to be 1.0 or 0.0. The shape should be the same as logits.
		logits: Holds the log probabilities for labels, a float tensor.
		@@ -4646,7 +4646,7 @@ class Hingeloss(Layer):
		"""

		def __init__(self, in_layers=None, **kwargs):
		super(Hingeloss, self).__init__(in_layers, **kwargs)
		super(HingeLoss, self).__init__(in_layers, **kwargs)
		try:
		self._shape = self.in_layers[1].shape
		except:

deepchem/models/tensorgraph/models/scscore.py

0 → 100644

+129 −0

Original line number	Diff line number	Diff line
		import numpy as np
		import tensorflow as tf
		from deepchem.data import NumpyDataset
		from deepchem.feat import CircularFingerprint
		from deepchem.models.tensorgraph.layers import Dense, HingeLoss, Sigmoid, \
		WeightedError, Dropout
		from deepchem.models.tensorgraph.layers import Label, Weights, Feature
		from deepchem.models.tensorgraph.tensor_graph import TensorGraph


		class ScScoreModel(TensorGraph):
		    """Pairwise model with two weight-sharing branches that learns SCScore.

		    https://pubs.acs.org/doi/abs/10.1021/acs.jcim.7b00622
		    Several definitions of molecular complexity exist to facilitate
		    prioritization of lead compounds, to identify diversity-inducing and
		    complexifying reactions, and to guide retrosynthetic searches. In this
		    work, we focus on synthetic complexity and reformalize its definition to
		    correlate with the expected number of reaction steps required to produce
		    a target molecule, with implicit knowledge about what compounds are
		    reasonable starting materials. We train a neural network model on 12
		    million reactions from the Reaxys database to impose a pairwise
		    inequality constraint enforcing the premise of this definition: that on
		    average, the products of published chemical reactions should be more
		    synthetically complex than their corresponding reactants. The learned
		    metric (SCScore) exhibits highly desirable nonlinear behavior,
		    particularly in recognizing increases in synthetic complexity throughout
		    a number of linear synthetic routes.

		    Our model here actually uses hinge loss instead of the shifted relu loss
		    in https://github.com/connorcoley/scscore.

		    This could cause differentiation issues with compounds that are "close"
		    to each other in "complexity".
		    """

		    def __init__(self,
		                 n_features,
		                 layer_sizes=(300, 300, 300),
		                 dropouts=0.0,
		                 **kwargs):
		        """Store the hyperparameters and build the graph.

		        Parameters
		        ----------
		        n_features: int
		            number of features per molecule
		        layer_sizes: sequence of int
		            size of each hidden layer
		        dropouts: float
		            dropout value handed to the Dropout layer placed after each
		            hidden layer; dropout layers are only added when this is > 0
		        kwargs
		            all remaining keyword arguments are forwarded to TensorGraph
		        """
		        self.n_features = n_features
		        # Keep a private copy so that neither the default value nor a list
		        # owned by the caller is aliased by this instance.
		        self.layer_sizes = list(layer_sizes)
		        self.dropout = dropouts
		        super(ScScoreModel, self).__init__(**kwargs)
		        self.build_graph()

		    def build_graph(self):
		        """Create the layers of the model.

		        Two Feature inputs (one per molecule of a pair) are passed through
		        the same stack of ReLU Dense layers and a single-unit readout. The
		        two model outputs are ``sigmoid(readout) * 4 + 1``; the loss is a
		        weighted hinge loss on the difference of the two raw readouts.
		        """
		        self.m1_features = Feature(shape=(None, self.n_features))
		        self.m2_features = Feature(shape=(None, self.n_features))
		        prev_layer1 = self.m1_features
		        prev_layer2 = self.m2_features
		        for layer_size in self.layer_sizes:
		            prev_layer1 = Dense(
		                out_channels=layer_size,
		                in_layers=[prev_layer1],
		                activation_fn=tf.nn.relu)
		            # NOTE(review): shared() presumably returns a layer that reuses
		            # the variables of prev_layer1 -- confirm against Layer.shared.
		            prev_layer2 = prev_layer1.shared([prev_layer2])
		            if self.dropout > 0.0:
		                prev_layer1 = Dropout(self.dropout, in_layers=[prev_layer1])
		                prev_layer2 = Dropout(self.dropout, in_layers=[prev_layer2])

		        readout_m1 = Dense(
		            out_channels=1, in_layers=[prev_layer1], activation_fn=None)
		        readout_m2 = readout_m1.shared([prev_layer2])
		        # Outputs are the sigmoid of each readout, scaled by 4 and shifted
		        # by 1.
		        self.add_output(Sigmoid(readout_m1) * 4 + 1)
		        self.add_output(Sigmoid(readout_m2) * 4 + 1)

		        # The loss is computed on the raw (pre-sigmoid) readout difference.
		        self.difference = readout_m1 - readout_m2
		        label = Label(shape=(None, 1))
		        loss = HingeLoss(in_layers=[label, self.difference])
		        self.my_task_weights = Weights(shape=(None, 1))
		        loss = WeightedError(in_layers=[loss, self.my_task_weights])
		        self.set_loss(loss)

		    def default_generator(self,
		                          dataset,
		                          epochs=1,
		                          predict=False,
		                          deterministic=True,
		                          pad_batches=True):
		        """Yield one feed dict per batch of ``dataset`` for each epoch.

		        Parameters
		        ----------
		        dataset
		            object whose ``iterbatches`` yields ``(X, y, w, ids)`` tuples;
		            ``X[:, 0]`` feeds the first molecule input and ``X[:, 1]`` the
		            second
		        epochs: int
		            number of passes over the dataset
		        predict: bool
		            when True, labels and weights are left out of the feed dict
		        deterministic: bool
		            forwarded to ``dataset.iterbatches``
		        pad_batches: bool
		            forwarded to ``dataset.iterbatches``
		        """
		        for epoch in range(epochs):
		            for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
		                    batch_size=self.batch_size,
		                    deterministic=deterministic,
		                    pad_batches=pad_batches):
		                feed_dict = dict()
		                feed_dict[self.m1_features] = X_b[:, 0]
		                feed_dict[self.m2_features] = X_b[:, 1]
		                if y_b is not None and not predict:
		                    feed_dict[self.labels[0]] = y_b
		                if w_b is not None and not predict:
		                    feed_dict[self.my_task_weights] = w_b
		                yield feed_dict

		    def predict_mols(self, mols):
		        """Return the first model output for each molecule in ``mols``.

		        The molecules are featurized with a chiral CircularFingerprint of
		        radius 2 and size ``n_features``. The same features are placed in
		        both input slots, and column 0 of the first output is returned.
		        """
		        featurizer = CircularFingerprint(
		            size=self.n_features, radius=2, chiral=True)
		        features = np.expand_dims(featurizer.featurize(mols), axis=1)
		        # Duplicate along axis 1 so both inputs see the same molecule.
		        features = np.concatenate([features, features], axis=1)
		        ds = NumpyDataset(features, None, None, None)
		        return self.predict(ds)[0][:, 0]

		    def create_estimator_inputs(self, feature_columns, weight_column,
		                                features, labels, mode):
		        """Map Estimator inputs onto this model's input layers.

		        The feature columns are paired, in order, with the first and second
		        molecule inputs. ``weight_column`` (if given) is mapped to
		        ``self.task_weights[0]`` and ``labels`` (if given) are cast to int32
		        and mapped to ``self.labels[0]``. ``mode`` is not used here.

		        Returns
		        -------
		        dict mapping input layers to tensors
		        """
		        tensors = {}
		        for layer, column in zip([self.m1_features, self.m2_features],
		                                 feature_columns):
		            tensors[layer] = tf.feature_column.input_layer(features, [column])
		        if weight_column is not None:
		            tensors[self.task_weights[0]] = tf.feature_column.input_layer(
		                features, [weight_column])
		        if labels is not None:
		            tensors[self.labels[0]] = tf.cast(labels, tf.int32)
		        return tensors

deepchem/models/tensorgraph/models/test_sascore.py

0 → 100644

+33 −0

Original line number	Diff line number	Diff line
		import unittest

		import deepchem
		import numpy as np
		from deepchem.models import TensorGraph


		class TestSaScoreModel(unittest.TestCase):
		    """Persistence test for ScScoreModel."""

		    def test_save_load(self):
		        """Test that ScScoreModel can be saved and loaded."""
		        n_samples = 10
		        n_features = 3
		        n_tasks = 1

		        # Seeded random data; axis 1 of the features holds the two
		        # molecules of each pair.
		        np.random.seed(123)
		        pair_features = np.random.rand(n_samples, 2, n_features)
		        targets = np.zeros((n_samples, n_tasks))
		        dataset = deepchem.data.NumpyDataset(pair_features, targets)

		        model = deepchem.models.ScScoreModel(n_features, dropouts=0)
		        model.fit(dataset, nb_epoch=1)
		        before = model.predict(dataset)

		        # Save the model, then rebind the name to a copy reloaded from the
		        # same directory.
		        model.save()
		        model = TensorGraph.load_from_dir(model.model_dir)
		        after = model.predict(dataset)

		        # Each output must be element-wise identical before and after.
		        for expected, reloaded in zip(before, after):
		            unchanged = np.all(expected == reloaded)
		            self.assertTrue(unchanged)

deepchem/models/tensorgraph/tests/test_estimators.py

+51 −0

Original line number	Diff line number	Diff line
		@@ -277,3 +277,54 @@ class TestEstimators(unittest.TestCase):

		results = estimator.evaluate(input_fn=lambda: input_fn(1))
		assert results['accuracy'] > 0.9

		def test_scscore(self):
		    """Build an Estimator from a ScScoreModel, then train and evaluate it."""
		    n_samples = 10
		    n_features = 3
		    n_tasks = 1

		    # Seeded random data; axis 1 of the features holds the two molecules
		    # of each pair.
		    np.random.seed(123)
		    pair_features = np.random.rand(n_samples, 2, n_features)
		    targets = np.zeros((n_samples, n_tasks))
		    dataset = dc.data.NumpyDataset(pair_features, targets)

		    def input_fn(epochs):
		        iterator = dataset.make_iterator(
		            batch_size=n_samples, epochs=epochs)
		        batch_x, batch_y, batch_w = iterator.get_next()
		        first = batch_x[:, 0]
		        second = batch_x[:, 1]
		        return {'x1': first, 'x2': second, 'weights': batch_w}, batch_y

		    # Build the model and make the raw readout difference its only output.
		    model = dc.models.ScScoreModel(n_features, dropouts=0)
		    del model.outputs[:]
		    model.outputs.append(model.difference)

		    def accuracy(labels, predictions, weights):
		        # sign() followed by relu() maps positive values to 1 and all
		        # others to 0.
		        binarized = tf.nn.relu(tf.sign(predictions))
		        return tf.metrics.accuracy(labels, binarized, weights)

		    # One numeric column per molecule input, plus one for the weights.
		    molecule_columns = [
		        tf.feature_column.numeric_column(key, shape=(n_features,))
		        for key in ('x1', 'x2')
		    ]
		    weight_col = tf.feature_column.numeric_column('weights', shape=(1,))

		    estimator = model.make_estimator(
		        feature_columns=molecule_columns,
		        metrics={'accuracy': accuracy},
		        weight_column=weight_col)

		    def train_input():
		        return input_fn(100)

		    def eval_input():
		        return input_fn(1)

		    # Train for 100 epochs, then evaluate on a single pass.
		    estimator.train(input_fn=train_input)
		    results = estimator.evaluate(input_fn=eval_input)

		    assert results['loss'] < 0.5
		    assert results['accuracy'] > 0.6

Admin message