Commit 73160191 authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #642 from lilleswing/tg-graphconv-cr

TensorGraph GraphConv class
parents 7fb141b5 ad30fcc3
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -28,4 +28,4 @@ from deepchem.models.tensorflow_models.progressive_multitask import ProgressiveM
from deepchem.models.tensorflow_models.progressive_joint import ProgressiveJointRegressor
from deepchem.models.tensorflow_models.IRV import TensorflowMultiTaskIRVClassifier
from deepchem.models.tensorgraph.tensor_graph import TensorGraph
from deepchem.models.tensorgraph.models.graph_models import WeaveTensorGraph, DTNNTensorGraph, DAGTensorGraph
from deepchem.models.tensorgraph.models.graph_models import WeaveTensorGraph, DTNNTensorGraph, DAGTensorGraph, GraphConvTensorGraph
+0 −86
Original line number Diff line number Diff line
import tensorflow as tf
from deepchem.models.tensorgraph.tensor_graph import TensorGraph
from deepchem.models.tensorgraph.layers import Input, Dense, Concat, SoftMax, SoftMaxCrossEntropy, Layer, \
  GraphConv, BatchNorm, GraphPool, GraphGather, WeightedError
from deepchem.metrics import to_one_hot
from deepchem.feat.mol_graphs import ConvMol
import time


class GraphConvTensorGraph(TensorGraph):
  """TensorGraph variant that feeds aggregated ConvMol batches directly.

  Instead of relying on an input queue, `fit` iterates over the dataset and
  builds one feed dict per batch with `_construct_feed_dict`.
  """

  def __init__(self, **kwargs):
    """Forward all keyword arguments to TensorGraph and set degree bounds."""
    super(GraphConvTensorGraph, self).__init__(**kwargs)
    self.min_degree = 0
    self.max_degree = 10

  def _construct_feed_dict(self, X_b, y_b, w_b, ids_b):
    """Map one batch onto the out_tensors of the label/weight/feature layers.

    Parameters
    ----------
    X_b
      Batch of molecules passed to `ConvMol.agglomerate_mols`.
    y_b
      Labels indexed as `y_b[:, task]`; each column is one-hot encoded.
      Skipped when None.
    w_b
      Weights fed to the first task-weight layer. Skipped when None.
    ids_b
      Unused; kept so the signature matches the batch tuple.

    Returns
    -------
    dict
      Feed dict keyed by tensors.
    """
    feed_dict = dict()
    if y_b is not None:
      for index, label in enumerate(self.labels):
        feed_dict[label.out_tensor] = to_one_hot(y_b[:, index])
    if self.task_weights is not None and w_b is not None:
      feed_dict[self.task_weights[0].out_tensor] = w_b
    if self.features is not None:
      multiConvMol = ConvMol.agglomerate_mols(X_b)
      feed_dict[self.features[0].out_tensor] = multiConvMol.get_atom_features()
      feed_dict[self.features[1].out_tensor] = multiConvMol.deg_slice
      feed_dict[self.features[2].out_tensor] = multiConvMol.membership
      # Fetch the adjacency lists once instead of once per degree.
      deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
      # Entry 0 of the adjacency lists is skipped; features[3:] receive
      # entries 1..max_degree.
      for i in range(self.max_degree):
        feed_dict[self.features[i + 3].out_tensor] = deg_adj_lists[i + 1]
    return feed_dict

  def fit(self,
          dataset,
          nb_epoch=10,
          max_checkpoints_to_keep=5,
          log_every_N_batches=50,
          checkpoint_interval=10):
    """Train the model by feeding batches directly (no input queue).

    TODO(LESWING) put this logic into tensor_graph or figure out how to use an input queue.

    Parameters
    ----------
    dataset
      Object providing `iterbatches(batch_size, deterministic, pad_batches)`.
    nb_epoch
      Number of passes over `dataset`.
    max_checkpoints_to_keep
      Passed to `tf.train.Saver` as `max_to_keep`.
    log_every_N_batches
      Currently unused.
    checkpoint_interval
      A checkpoint is saved and the average loss printed every
      `checkpoint_interval` epochs.

    Returns
    -------
    None
    """
    if not self.built:
      self.build()
    with self._get_tf("Graph").as_default():
      time1 = time.time()
      train_op = self._get_tf('train_op')
      saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)
      # The fetch list does not depend on the batch, so build it once.
      output_tensors = [x.out_tensor for x in self.outputs]
      fetches = output_tensors + [train_op, self.loss.out_tensor]
      with tf.Session() as sess:
        self._initialize_weights(sess, saver)
        avg_loss, n_batches = 0.0, 0.0
        for epoch in range(nb_epoch):
          for X_b, y_b, w_b, ids_b in dataset.iterbatches(
              self.batch_size, deterministic=True, pad_batches=True):
            feed_dict = self._construct_feed_dict(X_b, y_b, w_b, ids_b)
            fetched_values = sess.run(fetches, feed_dict=feed_dict)
            # The loss tensor is the last fetch.
            avg_loss += fetched_values[-1]
            n_batches += 1
            self.global_step += 1
          if epoch % checkpoint_interval == checkpoint_interval - 1:
            saver.save(sess, self.save_file, global_step=self.global_step)
            # Skip the report when no batch was seen (avoids dividing by 0).
            if n_batches > 0:
              avg_loss = float(avg_loss) / n_batches
              print('Ending epoch %d: Average loss %g' % (epoch, avg_loss))
            # Restart the accumulators so the next report does not mix an
            # already-averaged value with fresh per-batch losses.
            avg_loss, n_batches = 0.0, 0.0
        saver.save(sess, self.save_file, global_step=self.global_step)
        self.last_checkpoint = saver.last_checkpoints[-1]
      ############################################################## TIMING
      time2 = time.time()
      print("TIMING: model fitting took %0.3f s" % (time2 - time1))
      ############################################################## TIMING
+213 −63
Original line number Diff line number Diff line
import numpy as np
import tensorflow as tf
import six
import tensorflow as tf

from deepchem.models.tensorgraph.tensor_graph import TensorGraph
from deepchem.utils.evaluate import GeneratorEvaluator
from deepchem.models.tensorgraph.layers import Input, BatchNorm, Dense, \
    SoftMax, SoftMaxCrossEntropy, L2Loss, Concat, WeightedError, Label, Weights, Feature
from deepchem.feat.mol_graphs import ConvMol
from deepchem.metrics import to_one_hot, from_one_hot
from deepchem.models.tensorgraph.graph_layers import WeaveLayer, WeaveGather, \
    Combine_AP, Separate_AP, DTNNEmbedding, DTNNStep, DTNNGather, DAGLayer, DAGGather
from deepchem.metrics import to_one_hot, from_one_hot
from deepchem.models.tensorgraph.layers import Dense, Concat, SoftMax, SoftMaxCrossEntropy, GraphConv, BatchNorm, \
    GraphPool, GraphGather, WeightedError
from deepchem.models.tensorgraph.layers import L2Loss, Label, Weights, Feature
from deepchem.models.tensorgraph.tensor_graph import TensorGraph
from deepchem.trans import undo_transforms
from deepchem.utils.evaluate import GeneratorEvaluator


class WeaveTensorGraph(TensorGraph):
@@ -562,3 +564,151 @@ class DAGTensorGraph(TensorGraph):
            result = undo_transforms(result, transformers)
          results.append(result)
        return np.concatenate(results, axis=0)


class GraphConvTensorGraph(TensorGraph):
  """Graph-convolution model assembled from TensorGraph layers.

  The layer graph is created in `build_graph` (called from `__init__`), and
  batches are produced by `default_generator` as dicts keyed by the input
  layers created there. `self.mode` selects between a classification and a
  regression head per task.
  """

  def __init__(self, n_tasks, **kwargs):
    """
        Parameters
        ----------
        n_tasks: int
          Number of tasks
        **kwargs
          Forwarded to the TensorGraph constructor. `use_queue` is always
          overridden to False.

    """
    self.n_tasks = n_tasks
    kwargs['use_queue'] = False
    super(GraphConvTensorGraph, self).__init__(**kwargs)
    self.build_graph()

  def build_graph(self):
    """
    Building graph structures:
    two GraphConv -> BatchNorm -> GraphPool stages, a Dense + BatchNorm,
    a GraphGather, then one output head and one cost per task combined by
    a WeightedError loss.
    """
    self.atom_features = Feature(shape=(None, 75))
    self.degree_slice = Feature(shape=(None, 2), dtype=tf.int32)
    self.membership = Feature(shape=(None,), dtype=tf.int32)

    # Eleven adjacency inputs with widths 1..11.
    # NOTE(review): default_generator feeds at most len(lists) - 1 of them;
    # confirm the last input is never consumed by the layers.
    self.deg_adjs = []
    for i in range(0, 10 + 1):
      deg_adj = Feature(shape=(None, i + 1), dtype=tf.int32)
      self.deg_adjs.append(deg_adj)
    gc1 = GraphConv(
        64,
        activation_fn=tf.nn.relu,
        in_layers=[self.atom_features, self.degree_slice, self.membership] +
        self.deg_adjs)
    batch_norm1 = BatchNorm(in_layers=[gc1])
    gp1 = GraphPool(in_layers=[batch_norm1, self.degree_slice, self.membership]
                    + self.deg_adjs)
    gc2 = GraphConv(
        64,
        activation_fn=tf.nn.relu,
        in_layers=[gp1, self.degree_slice, self.membership] + self.deg_adjs)
    batch_norm2 = BatchNorm(in_layers=[gc2])
    gp2 = GraphPool(in_layers=[batch_norm2, self.degree_slice, self.membership]
                    + self.deg_adjs)
    dense = Dense(out_channels=128, activation_fn=None, in_layers=[gp2])
    batch_norm3 = BatchNorm(in_layers=[dense])
    gg1 = GraphGather(
        batch_size=self.batch_size,
        activation_fn=tf.nn.tanh,
        in_layers=[batch_norm3, self.degree_slice, self.membership] +
        self.deg_adjs)

    costs = []
    self.my_labels = []
    for task in range(self.n_tasks):
      if self.mode == 'classification':
        # Two-way logits; the softmax is the exposed output while the cost
        # is computed on the raw logits.
        classification = Dense(
            out_channels=2, activation_fn=None, in_layers=[gg1])

        softmax = SoftMax(in_layers=[classification])
        self.add_output(softmax)

        label = Label(shape=(None, 2))
        self.my_labels.append(label)
        cost = SoftMaxCrossEntropy(in_layers=[label, classification])
        costs.append(cost)
      elif self.mode == 'regression':
        regression = Dense(out_channels=1, activation_fn=None, in_layers=[gg1])
        self.add_output(regression)

        label = Label(shape=(None, 1))
        self.my_labels.append(label)
        cost = L2Loss(in_layers=[label, regression])
        costs.append(cost)

    entropy = Concat(in_layers=costs, axis=-1)
    self.my_task_weights = Weights(shape=(None, self.n_tasks))
    loss = WeightedError(in_layers=[entropy, self.my_task_weights])
    self.set_loss(loss)

  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        pad_batches=True):
    """Yield one feed dict (keyed by layer objects) per batch of `dataset`.

    Parameters
    ----------
    dataset
      Object providing `iterbatches`.
    epochs: int
      Number of passes over `dataset`.
    predict: bool
      Accepted but not consulted by this implementation.
    pad_batches: bool
      Accepted but not consulted; batches are always requested with
      `pad_batches=True` and `deterministic=True`.
    """
    for epoch in range(epochs):
      for X_b, y_b, w_b, ids_b in dataset.iterbatches(
          self.batch_size, pad_batches=True, deterministic=True):
        d = {}
        for index, label in enumerate(self.my_labels):
          if self.mode == 'classification':
            d[label] = to_one_hot(y_b[:, index])
          elif self.mode == 'regression':
            d[label] = np.expand_dims(y_b[:, index], -1)
        d[self.my_task_weights] = w_b
        multiConvMol = ConvMol.agglomerate_mols(X_b)
        d[self.atom_features] = multiConvMol.get_atom_features()
        d[self.degree_slice] = multiConvMol.deg_slice
        d[self.membership] = multiConvMol.membership
        # Fetch the adjacency lists once; entry 0 is skipped, so list i is
        # fed to deg_adjs[i - 1].
        deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
        for i in range(1, len(deg_adj_lists)):
          d[self.deg_adjs[i - 1]] = deg_adj_lists[i]
        yield d

  def predict(self, dataset, transformers=[], batch_size=None):
    """Predict on `dataset`; `batch_size` is accepted but unused."""
    generator = self.default_generator(dataset, predict=True, pad_batches=False)
    return self.predict_on_generator(generator, transformers)

  def predict_proba(self, dataset, transformers=[], batch_size=None):
    """Return raw model outputs for `dataset`; `batch_size` is unused."""
    generator = self.default_generator(dataset, predict=True, pad_batches=False)
    return self.predict_proba_on_generator(generator, transformers)

  def predict_on_generator(self, generator, transformers=[]):
    """Run the model over `generator`.

    In classification mode the outputs are converted back from one-hot
    along axis 2 and an axis is inserted at position 1; otherwise the raw
    outputs are returned unchanged.
    """
    retval = self.predict_proba_on_generator(generator, transformers)
    if self.mode == 'classification':
      retval = np.expand_dims(from_one_hot(retval, axis=2), axis=1)
    return retval

  def predict_proba_on_generator(self, generator, transformers=[]):
    """Restore the last checkpoint and evaluate all outputs per batch.

    Parameters
    ----------
    generator
      Iterable of dicts keyed by layer objects (see `default_generator`).
    transformers: list
      When non-empty, each batch result is passed through
      `undo_transforms`.

    Returns
    -------
    np.ndarray
      Per-batch results concatenated along axis 0.
    """
    if not self.built:
      self.build()
    with self._get_tf("Graph").as_default():
      with tf.Session() as sess:
        saver = tf.train.Saver()
        saver.restore(sess, self.last_checkpoint)
        out_tensors = [x.out_tensor for x in self.outputs]
        results = []
        for feed_dict in generator:
          # Translate layer keys into the tensors of the built graph.
          feed_dict = {
              self.layers[k.name].out_tensor: v
              for k, v in six.iteritems(feed_dict)
          }
          result = np.array(sess.run(out_tensors, feed_dict=feed_dict))
          if len(result.shape) == 3:
            # Swap the first two axes so the output index comes second.
            result = np.transpose(result, axes=[1, 0, 2])
          if len(transformers) > 0:
            result = undo_transforms(result, transformers)
          results.append(result)
        return np.concatenate(results, axis=0)

  def evaluate(self, dataset, metrics, transformers=[], per_task_metrics=False):
    """Evaluate `metrics` on `dataset` via `evaluate_generator`.

    Parameters
    ----------
    dataset
      Dataset to evaluate on.
    metrics
      Metrics forwarded to `evaluate_generator`.
    transformers: list
      Accepted for interface compatibility; not forwarded to
      `evaluate_generator`.
    per_task_metrics: bool
      Forwarded to `evaluate_generator`.

    Returns
    -------
    Whatever `evaluate_generator` returns for the given arguments.
    """
    if not self.built:
      self.build()
    return self.evaluate_generator(
        self.default_generator(dataset),
        metrics,
        labels=self.my_labels,
        weights=[self.my_task_weights],
        per_task_metrics=per_task_metrics)
+57 −0
Original line number Diff line number Diff line
import numpy as np

import deepchem
from deepchem.data import NumpyDataset
from deepchem.models import GraphConvTensorGraph
from deepchem.models import TensorGraph
from deepchem.molnet.load_function.delaney_datasets import load_delaney


def get_dataset(mode='classification', featurizer='GraphConv'):
  """Build a tiny random-label dataset from the Delaney training split.

  Parameters
  ----------
  mode: str
    'classification' (random 0/1 labels, ROC-AUC metric averaged with
    np.mean) or 'regression' (standard-normal labels, mean absolute error).
  featurizer: str
    Featurizer name passed to `load_delaney`.

  Returns
  -------
  tuple
    (tasks, dataset, transformers, metric)

  Raises
  ------
  ValueError
    If `mode` is neither 'classification' nor 'regression'.
  """
  # Validate before the dataset load so a typo fails fast and clearly
  # instead of surfacing later as an unbound local.
  if mode not in ('classification', 'regression'):
    raise ValueError(
        "mode must be 'classification' or 'regression', got %r" % (mode,))

  data_points = 10
  tasks, all_dataset, transformers = load_delaney(featurizer)
  train, _, _ = all_dataset

  if mode == 'classification':
    y = np.random.randint(0, 2, size=(data_points, len(tasks)))
    metric = deepchem.metrics.Metric(
        deepchem.metrics.roc_auc_score, np.mean, mode="classification")
  else:
    y = np.random.normal(size=(data_points, len(tasks)))
    metric = deepchem.metrics.Metric(
        deepchem.metrics.mean_absolute_error, mode="regression")

  # Slice with data_points so features, weights and ids stay aligned with y.
  ds = NumpyDataset(train.X[:data_points], y, train.w[:data_points],
                    train.ids[:data_points])

  return tasks, ds, transformers, metric


def test_graph_conv_model():
  """Smoke test: fit, evaluate, save, reload and re-evaluate a classifier."""
  task_names, data, undo, auc = get_dataset('classification', 'GraphConv')

  trained = GraphConvTensorGraph(
      len(task_names), batch_size=50, mode='classification')
  trained.fit(data, nb_epoch=1)
  trained.evaluate(data, [auc], undo)

  # Round-trip through disk and check the reloaded model still evaluates.
  trained.save()
  reloaded = TensorGraph.load_from_dir(trained.model_dir)
  reloaded.evaluate(data, [auc], undo)


def test_graph_conv_regression_model():
  """Smoke test: fit, evaluate, save, reload and re-evaluate a regressor."""
  task_names, data, undo, mae = get_dataset('regression', 'GraphConv')

  trained = GraphConvTensorGraph(
      len(task_names), batch_size=50, mode='regression')
  trained.fit(data, nb_epoch=1)
  trained.evaluate(data, [mae], undo)

  # Round-trip through disk and check the reloaded model still evaluates.
  trained.save()
  reloaded = TensorGraph.load_from_dir(trained.model_dir)
  reloaded.evaluate(data, [mae], undo)
+9 −2
Original line number Diff line number Diff line
@@ -452,13 +452,17 @@ class TensorGraph(Model):

    if labels is None:
      raise ValueError
    n_tasks = len(self.outputs)
    n_classes = self.outputs[0].out_tensor.get_shape()[-1].value
    evaluator = GeneratorEvaluator(
        self,
        feed_dict_generator,
        transformers,
        labels=labels,
        outputs=outputs,
        weights=weights)
        weights=weights,
        n_tasks=n_tasks,
        n_classes=n_classes)
    if not per_task_metrics:
      scores = evaluator.compute_model_performance(metrics)
      return scores
@@ -539,6 +543,9 @@ class TensorGraph(Model):
      tensorgraph.built = False
      return tensorgraph

  def __del__(self):
    # No-op finalizer: nothing is released when the instance is collected.
    # NOTE(review): presumably added to override inherited cleanup - confirm
    # against the base class before removing.
    pass


def _enqueue_batch(tg, generator, graph, sess, coord):
  """
Loading