Unverified Commit a1410118 authored by Karl Leswing's avatar Karl Leswing Committed by GitHub
Browse files

Merge pull request #1182 from lilleswing/sascore

SAScore module
parents adfbb040 be54f13a
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@ from deepchem.models.tensorgraph.robust_multitask import RobustMultitaskRegresso
from deepchem.models.tensorgraph.progressive_multitask import ProgressiveMultitaskRegressor, ProgressiveMultitaskClassifier
from deepchem.models.tensorgraph.models.graph_models import WeaveModel, DTNNModel, DAGModel, GraphConvModel, MPNNModel
from deepchem.models.tensorgraph.models.symmetry_function_regression import BPSymmetryFunctionRegression, ANIRegression
from deepchem.models.tensorgraph.models.scscore import ScScoreModel

from deepchem.models.tensorgraph.models.seqtoseq import SeqToSeq
from deepchem.models.tensorgraph.models.gan import GAN, WGAN
+2 −2
Original line number Diff line number Diff line
@@ -4638,7 +4638,7 @@ class GraphCNN(Layer):
    return result


class Hingeloss(Layer):
class HingeLoss(Layer):
  """This layer computes the hinge loss on inputs:[labels,logits]
  labels: The values of this tensor is expected to be 1.0 or 0.0. The shape should be the same as logits.
  logits: Holds the log probabilities for labels, a float tensor.
@@ -4646,7 +4646,7 @@ class Hingeloss(Layer):
  """

  def __init__(self, in_layers=None, **kwargs):
    super(Hingeloss, self).__init__(in_layers, **kwargs)
    super(HingeLoss, self).__init__(in_layers, **kwargs)
    try:
      self._shape = self.in_layers[1].shape
    except:
+129 −0
Original line number Diff line number Diff line
import numpy as np
import tensorflow as tf
from deepchem.data import NumpyDataset
from deepchem.feat import CircularFingerprint
from deepchem.models.tensorgraph.layers import Dense, HingeLoss, Sigmoid, \
  WeightedError, Dropout
from deepchem.models.tensorgraph.layers import Label, Weights, Feature
from deepchem.models.tensorgraph.tensor_graph import TensorGraph


class ScScoreModel(TensorGraph):
  """Pairwise synthetic-complexity (SCScore) model.

  https://pubs.acs.org/doi/abs/10.1021/acs.jcim.7b00622
  Several definitions of molecular complexity exist to facilitate prioritization
  of lead compounds, to identify diversity-inducing and complexifying reactions,
  and to guide retrosynthetic searches. In this work, we focus on synthetic
  complexity and reformalize its definition to correlate with the expected number
  of reaction steps required to produce a target molecule, with implicit knowledge
  about what compounds are reasonable starting materials. We train a neural
  network model on 12 million reactions from the Reaxys database to impose a
  pairwise inequality constraint enforcing the premise of this definition: that on
  average, the products of published chemical reactions should be more
  synthetically complex than their corresponding reactants. The learned metric
  (SCScore) exhibits highly desirable nonlinear behavior, particularly in
  recognizing increases in synthetic complexity throughout a number of linear
  synthetic routes.

  Our model here actually uses hinge loss instead of the shifted relu loss in
  https://github.com/connorcoley/scscore.

  This could cause differentiation issues with compounds that are "close"
  to each other in "complexity".

  Structure: two feature inputs (one per molecule of a pair) are pushed through
  the same stack of Dense layers (the second tower is created with `shared`),
  followed by a single-unit readout. The two model outputs are
  `Sigmoid(readout) * 4 + 1` for each molecule, and the loss is a weighted
  hinge loss on the difference of the two raw readouts.
  """

  def __init__(self,
               n_features,
               layer_sizes=[300, 300, 300],
               dropouts=0.0,
               **kwargs):
    """
    Parameters
    ----------
    n_features: int
      number of features per molecule
    layer_sizes: list of int
      size of each hidden layer
    dropouts: float
      value handed to the Dropout layer placed after each hidden layer;
      no Dropout layers are added unless it is greater than 0
    kwargs
      This takes all kwargs as TensorGraph
    """
    self.n_features = n_features
    # Store a private copy: the default is a mutable list shared by every call,
    # and a caller-supplied list could otherwise be mutated behind our back.
    self.layer_sizes = list(layer_sizes)
    self.dropout = dropouts
    super(ScScoreModel, self).__init__(**kwargs)
    self.build_graph()

  def build_graph(self):
    """
    Building graph structures:

    Creates the two Feature inputs, the shared hidden stack, the shared
    readout, the two outputs and the weighted hinge loss, and registers the
    outputs and loss on this TensorGraph.
    """
    self.m1_features = Feature(shape=(None, self.n_features))
    self.m2_features = Feature(shape=(None, self.n_features))
    prev_layer1 = self.m1_features
    prev_layer2 = self.m2_features
    for layer_size in self.layer_sizes:
      prev_layer1 = Dense(
          out_channels=layer_size,
          in_layers=[prev_layer1],
          activation_fn=tf.nn.relu)
      # The second tower is derived from the first Dense layer via `shared`
      # instead of building an independent Dense layer.
      prev_layer2 = prev_layer1.shared([prev_layer2])
      if self.dropout > 0.0:
        prev_layer1 = Dropout(self.dropout, in_layers=[prev_layer1])
        prev_layer2 = Dropout(self.dropout, in_layers=[prev_layer2])

    readout_m1 = Dense(
        out_channels=1, in_layers=[prev_layer1], activation_fn=None)
    readout_m2 = readout_m1.shared([prev_layer2])
    # Outputs are the sigmoid of each readout, scaled by 4 and shifted by 1.
    self.add_output(Sigmoid(readout_m1) * 4 + 1)
    self.add_output(Sigmoid(readout_m2) * 4 + 1)

    # The loss is computed on the raw readout difference, not on the outputs.
    self.difference = readout_m1 - readout_m2
    label = Label(shape=(None, 1))
    loss = HingeLoss(in_layers=[label, self.difference])
    self.my_task_weights = Weights(shape=(None, 1))
    loss = WeightedError(in_layers=[loss, self.my_task_weights])
    self.set_loss(loss)

  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        deterministic=True,
                        pad_batches=True):
    """Yield one feed dict per batch of `dataset`, for `epochs` passes.

    Parameters
    ----------
    dataset: object providing `iterbatches`
      each batch's X is indexed as `X_b[:, 0]` (first molecule of the pair)
      and `X_b[:, 1]` (second molecule of the pair)
    epochs: int
      number of passes over the dataset
    predict: bool
      when True, labels and weights are left out of the feed dict
    deterministic: bool
      forwarded to `dataset.iterbatches`
    pad_batches: bool
      forwarded to `dataset.iterbatches`

    Yields
    ------
    dict
      maps input layers to the batch arrays that feed them
    """
    for _ in range(epochs):
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches):
        feed_dict = {
            self.m1_features: X_b[:, 0],
            self.m2_features: X_b[:, 1],
        }
        if y_b is not None and not predict:
          feed_dict[self.labels[0]] = y_b
        if w_b is not None and not predict:
          feed_dict[self.my_task_weights] = w_b
        yield feed_dict

  def predict_mols(self, mols):
    """Score molecules individually.

    The molecules are featurized with a CircularFingerprint of size
    `n_features` (radius 2, chiral). Each fingerprint is duplicated along
    axis 1 so that both inputs of the pair model receive the same molecule.

    Parameters
    ----------
    mols: molecules accepted by `CircularFingerprint.featurize`

    Returns
    -------
    Column 0 of the model's first output, one value per molecule.
    """
    featurizer = CircularFingerprint(
        size=self.n_features, radius=2, chiral=True)
    features = np.expand_dims(featurizer.featurize(mols), axis=1)
    features = np.concatenate([features, features], axis=1)
    ds = NumpyDataset(features, None, None, None)
    return self.predict(ds)[0][:, 0]

  def create_estimator_inputs(self, feature_columns, weight_column, features,
                              labels, mode):
    """Map this model's input layers to tensors for use in an Estimator.

    Parameters
    ----------
    feature_columns: sequence of feature columns
      paired in order with the first and second molecule inputs
    weight_column: feature column or None
      column holding the per-sample weights, if any
    features: dict of tensors handed to `tf.feature_column.input_layer`
    labels: tensor or None
      cast to int32 before being bound to the label layer
    mode: not used by this implementation

    Returns
    -------
    dict
      maps input layers to the tensors that should feed them
    """
    tensors = {}
    for layer, column in zip([self.m1_features, self.m2_features],
                             feature_columns):
      tensors[layer] = tf.feature_column.input_layer(features, [column])
    if weight_column is not None:
      tensors[self.task_weights[0]] = tf.feature_column.input_layer(
          features, [weight_column])
    if labels is not None:
      tensors[self.labels[0]] = tf.cast(labels, tf.int32)
    return tensors
+33 −0
Original line number Diff line number Diff line
import unittest

import deepchem
import numpy as np
from deepchem.models import TensorGraph


class TestSaScoreModel(unittest.TestCase):
  """Persistence checks for ScScoreModel."""

  def test_save_load(self):
    """Test that a ScScoreModel can be saved and loaded."""
    n_samples, n_features, n_tasks = 10, 3, 1

    # Build a small random dataset of molecule pairs; seed first so the
    # generated features are reproducible.
    np.random.seed(123)
    pair_features = np.random.rand(n_samples, 2, n_features)
    targets = np.zeros((n_samples, n_tasks))
    dataset = deepchem.data.NumpyDataset(pair_features, targets)

    model = deepchem.models.ScScoreModel(n_features, dropouts=0)
    model.fit(dataset, nb_epoch=1)
    before_reload = model.predict(dataset)

    # Persist the model, then replace it with a copy restored from disk.
    model.save()
    saved_dir = model.model_dir
    model = TensorGraph.load_from_dir(saved_dir)
    after_reload = model.predict(dataset)

    # Every output must be reproduced exactly by the restored model.
    for expected, actual in zip(before_reload, after_reload):
      self.assertTrue(np.all(expected == actual))
+51 −0
Original line number Diff line number Diff line
@@ -277,3 +277,54 @@ class TestEstimators(unittest.TestCase):

    results = estimator.evaluate(input_fn=lambda: input_fn(1))
    assert results['accuracy'] > 0.9

  def test_scscore(self):
    """Check that an Estimator built from a ScScoreModel trains and evaluates."""
    n_samples, n_features, n_tasks = 10, 3, 1

    # Random molecule-pair features with all-zero labels, seeded for
    # reproducibility.
    np.random.seed(123)
    pair_features = np.random.rand(n_samples, 2, n_features)
    targets = np.zeros((n_samples, n_tasks))
    dataset = dc.data.NumpyDataset(pair_features, targets)

    def input_fn(epochs):
      # Split each pair into the two per-molecule inputs the model expects.
      iterator = dataset.make_iterator(batch_size=n_samples, epochs=epochs)
      x, y, weights = iterator.get_next()
      return {'x1': x[:, 0], 'x2': x[:, 1], 'weights': weights}, y

    # Build the model and expose only the raw pairwise difference as output.
    model = dc.models.ScScoreModel(n_features, dropouts=0)
    del model.outputs[:]
    model.outputs.append(model.difference)

    def accuracy(labels, predictions, weights):
      # sign() followed by relu() leaves 1 for positive differences, else 0.
      hard_predictions = tf.nn.relu(tf.sign(predictions))
      return tf.metrics.accuracy(labels, hard_predictions, weights)

    # Describe the inputs and turn the model into an Estimator.
    molecule_cols = [
        tf.feature_column.numeric_column(key, shape=(n_features,))
        for key in ('x1', 'x2')
    ]
    weight_col = tf.feature_column.numeric_column('weights', shape=(1,))
    estimator = model.make_estimator(
        feature_columns=molecule_cols,
        metrics={'accuracy': accuracy},
        weight_column=weight_col)

    # Train for 100 epochs, then evaluate on a single pass over the data.
    estimator.train(input_fn=lambda: input_fn(100))
    results = estimator.evaluate(input_fn=lambda: input_fn(1))

    assert results['loss'] < 0.5
    assert results['accuracy'] > 0.6
    assert results['accuracy'] > 0.6
Loading