Commit 735df602 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #285 from miaecle/Benchmark2

Benchmark script and scores
parents c8491ef0 2d1e038e
Loading
Loading
Loading
Loading
+17 −6
Original line number Diff line number Diff line
@@ -205,11 +205,22 @@ different subclasses of ``Featurizer`` for convenience:
### Performances
|Dataset    |Model               |Train score/ROC-AUC|Valid score/ROC-AUC|Time(loading)/s |Time(running)/s|
|-----------|--------------------|-------------------|-------------------|----------------|---------------| 
|tox21      |tensorflow(MT-DNN)  |0.987              |0.800              |35              |36             |
|muv        |tensorflow(MT-DNN)  |0.979              |0.660              |414             |255            |
|pcba       |tensorflow(MT-DNN)	 |0.949        	     |0.791              |1765            |7209           |                                         
|sider      |tensorflow(MT-DNN)	 |0.864        	     |0.627              |10              |63             |                                         
|toxcast    |tensorflow(MT-DNN)	 |0.944        	     |0.697              |75              |2374           |                                         
|tox21      |logistic regression |0.910              |0.759              |30              |30             |
|           |tensorflow(MT-NN)   |0.987              |0.800              |30              |30             |
|           |graph convolution   |0.930              |0.819              |40              |40             |
|muv        |logistic regression |0.910              |0.744              |600             |800            |
|           |tensorflow(MT-NN)   |0.980              |0.710              |600             |800            |
|           |graph convolution   |0.881              |0.832              |800             |1200           |
|pcba       |logistic regression |0.759        	     |0.736              |1800            |5400           |                                         
|           |tensorflow(MT-NN)	 |0.949        	     |0.791              |1800            |7200           |                                         
|           |graph convolution   |0.866        	     |0.836              |2200            |20000          |                                         
|sider      |logistic regression |0.900        	     |0.620              |15              |40             |                                         
|           |tensorflow(MT-NN)	 |0.931        	     |0.647              |15              |60             |                                         
|           |graph convolution   |0.845        	     |0.646              |20              |60             |                                         
|toxcast    |logistic regression |0.762        	     |0.622              |80              |2000           |                                         
|           |tensorflow(MT-NN)	 |0.926        	     |0.705              |80              |2400           |                                         
|           |graph convolution   |0.906        	     |0.725              |80              |3000           |                                         


## Contributing to DeepChem

+1 −0
Original line number Diff line number Diff line
@@ -20,3 +20,4 @@ from deepchem.models.keras_models.fcnet import MultiTaskDNN
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskRegressor
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifier
from deepchem.models.tensorflow_models.robust_multitask import RobustMultitaskRegressor
from deepchem.models.tensorflow_models.lr import TensorflowLogisticRegression
 No newline at end of file
+231 −0
Original line number Diff line number Diff line
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 08 14:10:02 2016

@author: Zhenqin Wu
"""
import tensorflow as tf
import numpy as np
import os
import time

from deepchem.metrics import from_one_hot
from deepchem.models.tensorflow_models import TensorflowGraph
from deepchem.models.tensorflow_models import TensorflowGraphModel
from deepchem.models.tensorflow_models import model_ops
from deepchem.utils.save import log
from deepchem.data import pad_features
from deepchem.metrics import to_one_hot

def weight_decay(penalty_type, penalty):
  """Builds the weight-decay (regularization) cost used by logistic regression.

  Collects the weight matrices among `tf.trainable_variables()`, sums an L1 or
  L2 penalty over them and scales the sum by `penalty`. Bias vectors are left
  out of the penalty. A scalar summary named 'Weight Decay Cost' is recorded.

  Args:
    penalty_type: 'l1' (sum of absolute values) or 'l2' (`tf.nn.l2_loss`).
    penalty: Multiplier applied to the summed penalty.

  Returns:
    Scalar tensor holding the scaled penalty.

  Raises:
    NotImplementedError: If `penalty_type` is neither 'l1' nor 'l2'.
  """
  # Weights are rank-2 while biases are rank-1, so select on rank. Testing the
  # size of the first dimension instead would also drop the weight matrix
  # whenever the model has a single input feature, leaving nothing to sum.
  variables = [v for v in tf.trainable_variables()
               if len(v.get_shape().as_list()) > 1]

  with tf.name_scope('weight_decay'):
    if penalty_type == 'l1':
      cost = tf.add_n([tf.reduce_sum(tf.abs(v)) for v in variables])
    elif penalty_type == 'l2':
      cost = tf.add_n([tf.nn.l2_loss(v) for v in variables])
    else:
      raise NotImplementedError('Unsupported penalty_type %s' % penalty_type)
    cost *= penalty
    tf.scalar_summary('Weight Decay Cost', cost)
  return cost
    
    
class TensorflowLogisticRegression(TensorflowGraphModel):
  """A simple tensorflow based logistic regression model.

  Every task gets its own single-unit fully connected layer placed directly on
  the input features; a sigmoid on that unit yields the probability of the
  positive class.
  """

  def build(self, graph, name_scopes, training):
    """Constructs the graph architecture of model: n_tasks * sigmoid nodes.

    This method creates the following Placeholders:
      mol_features: Molecule descriptor (e.g. fingerprint) tensor with shape
        batch_size x n_features.

    Args:
      graph: Graph the ops are added to.
      name_scopes: Passed through to `TensorflowGraph.get_placeholder_scope`.
      training: Not used by this model.

    Returns:
      List with one logit tensor per task, each produced by a layer of size 1.
    """
    placeholder_scope = TensorflowGraph.get_placeholder_scope(
        graph, name_scopes)
    n_features = self.n_features
    with graph.as_default():
      with placeholder_scope:
        self.mol_features = tf.placeholder(
            tf.float32,
            shape=[None, n_features],
            name='mol_features')

      # Only the first entry of each init list is used: there is one layer
      # per task and all of them share the same initialisation settings.
      weight_stddev = self.weight_init_stddevs[0]
      bias_const = self.bias_init_consts[0]
      lg_list = []
      for _ in range(self.n_tasks):
        # One output node per task.
        lg = model_ops.fully_connected_layer(
            tensor=self.mol_features,
            size=1,
            weight_init=tf.truncated_normal(
                shape=[n_features, 1],
                stddev=weight_stddev),
            bias_init=tf.constant(value=bias_const,
                                  shape=[1]))
        lg_list.append(lg)

    return lg_list

  def add_label_placeholders(self, graph, name_scopes):
    """Creates one float label placeholder of shape batch_size x 1 per task.

    The placeholders are named 'labels_<task>' and each one is wrapped in
    `tf.identity`.

    Returns:
      List of n_tasks label tensors.
    """
    labels = []
    placeholder_scope = TensorflowGraph.get_placeholder_scope(graph, name_scopes)
    with placeholder_scope:
      for task in range(self.n_tasks):
        labels.append(tf.identity(
            tf.placeholder(tf.float32, shape=[None, 1],
                           name='labels_%d' % task)))
    return labels

  def add_training_cost(self, graph, name_scopes, output, labels, weights):
    """Builds the training loss: summed per-task costs plus weight decay.

    Args:
      graph: Graph the ops are added to.
      name_scopes: Passed through to `TensorflowGraph.shared_name_scope`.
      output: Per-task logit tensors.
      labels: Per-task label tensors.
      weights: Per-task example-weight tensors.

    Returns:
      Scalar loss tensor.
    """
    with graph.as_default():
      gradient_costs = []  # costs used for gradient calculation

      with TensorflowGraph.shared_name_scope('costs', graph, name_scopes):
        for task in range(self.n_tasks):
          task_str = str(task).zfill(len(str(self.n_tasks)))
          with TensorflowGraph.shared_name_scope(
              'cost_{}'.format(task_str), graph, name_scopes):
            with tf.name_scope('weighted'):
              weighted_cost = self.cost(output[task], labels[task],
                                        weights[task])

            with tf.name_scope('gradient'):
              # Note that we divide by the batch size and not the number of
              # non-zero weight examples in the batch.  Also, instead of using
              # tf.reduce_mean (which can put ops on the CPU) we explicitly
              # calculate with div/sum so it stays on the GPU.
              gradient_cost = tf.div(tf.reduce_sum(weighted_cost),
                                     self.batch_size)
              gradient_costs.append(gradient_cost)

        # aggregated costs
        with TensorflowGraph.shared_name_scope('aggregated', graph, name_scopes):
          with tf.name_scope('gradient'):
            loss = tf.add_n(gradient_costs)

          # weight decay
          if self.penalty != 0.0:
            # using self-defined regularization
            penalty = weight_decay(self.penalty_type, self.penalty)
            loss += penalty

      return loss

  def cost(self, logits, labels, weights):
    """Returns the per-example weighted sigmoid cross entropy for one task.

    Args:
      logits: Logits of shape batch_size x 1.
      labels: Labels of shape batch_size x 1.
      weights: Example weights of shape batch_size (fed as a 1-D array by
        `construct_feed_dict`).

    Returns:
      Tensor of shape batch_size x 1.
    """
    # The weights are 1-D while the cross entropy is a column. Without the
    # extra axis the product would broadcast to batch_size x batch_size and
    # the summed loss would be scaled by the total weight of the batch.
    return tf.mul(tf.nn.sigmoid_cross_entropy_with_logits(logits, labels),
                  tf.expand_dims(weights, 1))

  def add_output_ops(self, graph, output):
    """Replaces every task's logits with a sigmoid node 'sigmoid_<task>'.

    Returns:
      List of sigmoid tensors, in the same order as `output`.
    """
    with graph.as_default():
      with tf.name_scope('inference'):
        sigmoid = [tf.nn.sigmoid(logits, name='sigmoid_%d' % i)
                   for i, logits in enumerate(output)]
    return sigmoid

  def construct_feed_dict(self, X_b, y_b=None, w_b=None, ids_b=None):
    """Builds the feed dict for one batch.

    Args:
      X_b: Feature array fed to 'mol_features'.
      y_b: Optional label array indexed as [example, task]. When omitted,
        all-zero dummy labels of shape batch_size x 1 are fed.
      w_b: Optional weight array indexed as [example, task]. When omitted,
        dummy weights of one with shape batch_size are fed.
      ids_b: Not used.

    Returns:
      The result of `TensorflowGraph.get_feed_dict` for the assembled entries.
    """
    orig_dict = {}
    orig_dict["mol_features"] = X_b
    for task in range(self.n_tasks):
      if y_b is not None:
        y_2column = to_one_hot(y_b[:, task])
        # Keep only column 1 (presumably the positive-class indicator) as a
        # 2-D slice so it matches the [?, 1] label placeholder.
        orig_dict["labels_%d" % task] = y_2column[:, 1:2]
      else:
        # Dummy placeholders
        orig_dict["labels_%d" % task] = np.zeros((self.batch_size, 1))
      if w_b is not None:
        orig_dict["weights_%d" % task] = w_b[:, task]
      else:
        # Dummy placeholders
        orig_dict["weights_%d" % task] = np.ones(
            (self.batch_size,))
    return TensorflowGraph.get_feed_dict(orig_dict)

  @staticmethod
  def _to_class_probabilities(task_outputs):
    """Turns per-task sigmoid outputs into two-class probabilities.

    Args:
      task_outputs: Sequence of n_tasks arrays, each batch_size x 1 (or
        batch_size), holding the positive-class probability.

    Returns:
      Array of shape batch_size x n_tasks x 2 ordered as
      [1 - p, p] along the last axis.

    Raises:
      ValueError: If the outputs cannot be arranged as n_tasks x batch_size.
    """
    positive = np.asarray(task_outputs, dtype=float)
    # Drop only the trailing unit axis coming from the size-1 output layer.
    # A blanket squeeze would also remove the task or batch axis whenever
    # there is a single task or a single example.
    if positive.ndim == 3 and positive.shape[-1] == 1:
      positive = positive[..., 0]
    if positive.ndim != 2:
      raise ValueError(
          'Unrecognized rank combination for output: %s' %
          (positive.shape,))
    stacked = np.stack([1.0 - positive, positive], axis=-1)
    # reshape to batch_size x n_tasks x n_classes
    return stacked.transpose((1, 0, 2))

  def _eval_class_probabilities(self, X, pad_batch):
    """Runs the eval graph on X and returns batch_size x n_tasks x 2 probs."""
    if pad_batch:
      X = pad_features(self.batch_size, X)
    if not self._restored_model:
      self.restore()
    with self.eval_graph.graph.as_default():
      # run eval data through the model
      with self._get_shared_session(train=False).as_default():
        feed_dict = self.construct_feed_dict(X)
        data = self._get_shared_session(train=False).run(
            self.eval_graph.output, feed_dict=feed_dict)
    return self._to_class_probabilities(data[:self.n_tasks])

  def predict_proba_on_batch(self, X, pad_batch=False):
    """Returns class probabilities for one batch.

    Args:
      X: Feature array for the batch.
      pad_batch: If True, X is first passed through `pad_features` with
        `self.batch_size`.

    Returns:
      Array of shape batch_size x n_tasks x 2 with [1 - p, p] along the last
      axis, where p is the sigmoid output of the task.
    """
    outputs = self._eval_class_probabilities(X, pad_batch)
    return np.copy(outputs)

  def predict_on_batch(self, X, pad_batch=False):
    """Returns class predictions for one batch.

    The batch_size x n_tasks x 2 probabilities are squeezed and then decoded
    with `from_one_hot` along the last axis, so unit-length batch or task
    axes do not appear in the result.

    Args:
      X: Feature array for the batch.
      pad_batch: If True, X is first passed through `pad_features` with
        `self.batch_size`.

    Returns:
      Array of decoded predictions.
    """
    probabilities = self._eval_class_probabilities(X, pad_batch)
    outputs = np.array(from_one_hot(np.squeeze(probabilities), axis=-1))
    return np.copy(outputs)
+1 −1
Original line number Diff line number Diff line
@@ -181,7 +181,7 @@ def weight_decay(penalty_type, penalty):

  with tf.name_scope('weight_decay'):
    if penalty_type == 'l1':
      cost = tf.add_n([tf.reduce_sum(tf.Abs(v)) for v in variables])
      cost = tf.add_n([tf.reduce_sum(tf.abs(v)) for v in variables])
    elif penalty_type == 'l2':
      cost = tf.add_n([tf.nn.l2_loss(v) for v in variables])
    else:
+32 −0
Original line number Diff line number Diff line
@@ -463,6 +463,38 @@ class TestOverfit(test_util.TensorFlowTestCase):
    scores = model.evaluate(dataset, [classification_metric])
    assert scores[classification_metric.name] > .9


  def test_tf_logreg_multitask_classification_overfit(self):
    """Check that TF logistic regression overfits a tiny multitask dataset."""
    n_tasks = 10
    n_samples = 10
    n_features = 3

    # Dummy dataset: random features, all-zero labels and unit weights.
    np.random.seed(123)
    sample_ids = np.arange(n_samples)
    features = np.random.rand(n_samples, n_features)
    label_shape = (n_samples, n_tasks)
    dataset = dc.data.NumpyDataset(
        features, np.zeros(label_shape), np.ones(label_shape), sample_ids)

    verbosity = "high"
    accuracy = dc.metrics.Metric(
        dc.metrics.accuracy_score, verbosity=verbosity, task_averager=np.mean)
    logreg = dc.models.TensorflowLogisticRegression(
        n_tasks, n_features, learning_rate=0.5, weight_init_stddevs=[.01],
        batch_size=n_samples, verbosity=verbosity)
    model = dc.models.TensorflowModel(logreg)

    # Train, then persist the fitted model.
    model.fit(dataset)
    model.save()

    # Accuracy on the training data itself must be above 0.9.
    scores = model.evaluate(dataset, [accuracy])
    assert scores[accuracy.name] > .9

  def test_sklearn_multitask_regression_overfit(self):
    """Test SKLearn singletask-to-multitask overfits tiny regression data."""
    n_tasks = 2
Loading