Commit 55437e02 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #418 from miaecle/HIV

HIV dataset
parents 7969343c 36440eb4
Loading
Loading
Loading
Loading
+34 −10
Original line number Diff line number Diff line
@@ -221,6 +221,12 @@ Index splitting
|           |Multitask network   |0.934              |0.830              |
|           |robust MT-NN        |0.949              |0.827              |
|           |graph convolution   |0.946              |0.860              |
|hiv        |logistic regression |0.864              |0.739              |
|           |Random Forest       |0.999              |0.720              |
|           |IRV                 |0.841              |0.724              |
|           |Multitask network   |0.761              |0.652              |
|           |robust MT-NN        |0.780              |0.708              |
|           |graph convolution   |0.876              |0.779              |

Random splitting

@@ -256,6 +262,12 @@ Random splitting
|           |Multitask network   |0.951              |0.834              |
|           |robust MT-NN        |0.959              |0.830              |
|           |graph convolution   |0.975              |0.876              |
|hiv        |logistic regression |0.860              |0.806              |
|           |Random Forest       |0.999              |0.850              |
|           |IRV                 |0.839              |0.809              |
|           |Multitask network   |0.742              |0.715              |
|           |robust MT-NN        |0.753              |0.727              |
|           |graph convolution   |0.847              |0.803              |

Scaffold splitting

@@ -291,6 +303,12 @@ Scaffold splitting
|           |Multitask network   |0.947              |0.862              |
|           |robust MT-NN        |0.953              |0.890              |
|           |graph convolution   |0.957              |0.823              |
|hiv        |logistic regression |0.858              |0.798              |
|           |Random Forest       |0.946              |0.562              |
|           |IRV                 |0.847              |0.811              |
|           |Multitask network   |0.775              |0.765              |
|           |robust MT-NN        |0.785              |0.748              |
|           |graph convolution   |0.867              |0.769              |

* Regression

@@ -329,20 +347,19 @@ Scaffold splitting
|chembl          |MT-NN regression    |Index       |0.443         |0.427         |
|                |MT-NN regression    |Random      |0.464         |0.434         |
|                |MT-NN regression    |Scaffold    |0.484         |0.361         |
|qm7             |NN regression       |Index       |0.994         |0.969         |
|                |NN regression       |Random      |0.995         |0.992         |
|                |NN regression       |Stratified  |0.992         |0.992         | 
|qm7b            |MT-NN regression    |Index       |0.883         |0.785         |
|                |MT-NN regression    |Random      |0.864         |0.838         |
|                |MT-NN regression    |Stratified  |0.871         |0.847         | 
|qm7             |NN regression       |Index       |0.997         |0.986         |
|                |NN regression       |Random      |0.999         |0.999         |
|                |NN regression       |Stratified  |0.999         |0.999         | 
|qm7b            |MT-NN regression    |Index       |0.931         |0.803         |
|                |MT-NN regression    |Random      |0.923         |0.884         |
|                |MT-NN regression    |Stratified  |0.934         |0.884         | 
|kaggle          |MT-NN regression    |User-defined|0.748         |0.452         |

|Dataset         |Model            |Splitting   |Train score/MAE(kcal/mol)|Valid score/MAE(kcal/mol)|
|----------------|-----------------|------------|-------------------------|-------------------------|
|qm7             |NN regression    |Index       |22.1                     |23.2                     |
|                |NN regression    |Random      |16.2                     |17.7                     |
|                |NN regression    |Stratified  |20.5                     |20.8                     |
|                |NN regression    |User-defined|9.0                      |9.5                      |
|qm7             |NN regression    |Index       |11.0                     |12.0                     |
|                |NN regression    |Random      |7.12                     |7.53                     |
|                |NN regression    |Stratified  |6.61                     |7.34                     |


* General features
@@ -357,6 +374,7 @@ Number of tasks and examples in the datasets
|sider           |27         |1427       |
|toxcast         |617        |8615       |
|clintox         |2          |1491       |
|hiv             |1          |41913      |
|delaney         |1          |1128       |
|sampl           |1          |643        |
|kaggle          |15         |173065     |
@@ -404,6 +422,12 @@ Time needed for benchmark test(~20h in total)
|                |random forest       |15              |200            |
|                |IRV                 |15              |10             |
|                |graph convolution   |20              |130            |
|hiv             |logistic regression |180             |40             |
|                |Multitask network   |180             |350            |
|                |robust MT-NN        |180             |450            |
|                |random forest       |180             |2800           |
|                |IRV                 |180             |200            |
|                |graph convolution   |180             |1300           |
|delaney         |MT-NN regression    |10              |40             |
|                |graphconv regression|10              |40             |
|                |random forest       |10              |30             |
+48 −53
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ from deepchem.utils.save import log
from deepchem.data import pad_features
from deepchem.metrics import to_one_hot


def weight_decay(penalty_type, penalty):
  # due to the different shape of weight(ndims=2) and bias(ndims=1),
  # will use this version for logreg
@@ -40,6 +41,7 @@ def weight_decay(penalty_type, penalty):

class TensorflowLogisticRegression(TensorflowGraphModel):
  """ A simple tensorflow based logistic regression model. """

  def build(self, graph, name_scopes, training):
    """Constructs the graph architecture of model: n_tasks * sigmoid nodes.

@@ -47,15 +49,13 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
      mol_features: Molecule descriptor (e.g. fingerprint) tensor with shape
        batch_size x n_features.
    """
    placeholder_scope = TensorflowGraph.get_placeholder_scope(
        graph, name_scopes)
    placeholder_scope = TensorflowGraph.get_placeholder_scope(graph,
                                                              name_scopes)
    n_features = self.n_features
    with graph.as_default():
      with placeholder_scope:
        self.mol_features = tf.placeholder(
            tf.float32,
            shape=[None, n_features],
            name='mol_features')
            tf.float32, shape=[None, n_features], name='mol_features')

      weight_init_stddevs = self.weight_init_stddevs
      bias_init_consts = self.bias_init_consts
@@ -66,23 +66,22 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
            tensor=self.mol_features,
            size=1,
            weight_init=tf.truncated_normal(
                shape=[self.n_features, 1],
                stddev=weight_init_stddevs[0]),
            bias_init=tf.constant(value=bias_init_consts[0],
                                  shape=[1]))
                shape=[self.n_features, 1], stddev=weight_init_stddevs[0]),
            bias_init=tf.constant(value=bias_init_consts[0], shape=[1]))
        lg_list.append(lg)

    return lg_list

  def add_label_placeholders(self, graph, name_scopes):
    #label placeholders with size batch_size * 1
    labels = []
    placeholder_scope = TensorflowGraph.get_placeholder_scope(graph, name_scopes)
    placeholder_scope = TensorflowGraph.get_placeholder_scope(graph,
                                                              name_scopes)
    with placeholder_scope:
      for task in range(self.n_tasks):
        labels.append(tf.identity(
            tf.placeholder(tf.float32, shape=[None,1],
                           name='labels_%d' % task)))
        labels.append(
            tf.identity(
                tf.placeholder(
                    tf.float32, shape=[None, 1], name='labels_%d' % task)))
    return labels

  def add_training_cost(self, graph, name_scopes, output, labels, weights):
@@ -94,8 +93,8 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
      with TensorflowGraph.shared_name_scope('costs', graph, name_scopes):
        for task in range(self.n_tasks):
          task_str = str(task).zfill(len(str(self.n_tasks)))
          with TensorflowGraph.shared_name_scope(
              'cost_{}'.format(task_str), graph, name_scopes):
          with TensorflowGraph.shared_name_scope('cost_{}'.format(task_str),
                                                 graph, name_scopes):
            with tf.name_scope('weighted'):
              weighted_cost = self.cost(output[task], labels[task],
                                        weights[task])
@@ -106,12 +105,13 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
              # non-zero weight examples in the batch.  Also, instead of using
              # tf.reduce_mean (which can put ops on the CPU) we explicitly
              # calculate with div/sum so it stays on the GPU.
              gradient_cost = tf.div(tf.reduce_sum(weighted_cost),
                                     self.batch_size)
              gradient_cost = tf.div(
                  tf.reduce_sum(weighted_cost), self.batch_size)
              gradient_costs.append(gradient_cost)

        # aggregated costs
        with TensorflowGraph.shared_name_scope('aggregated', graph, name_scopes):
        with TensorflowGraph.shared_name_scope('aggregated', graph,
                                               name_scopes):
          with tf.name_scope('gradient'):
            loss = tf.add_n(gradient_costs)

@@ -124,8 +124,8 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
      return loss

  def cost(self, logits, labels, weights):
    return tf.mul(tf.nn.sigmoid_cross_entropy_with_logits(logits, labels),
                  weights)
    return tf.mul(
        tf.nn.sigmoid_cross_entropy_with_logits(logits, labels), weights)

  def add_output_ops(self, graph, output):
    # adding output nodes of sigmoid function
@@ -153,8 +153,7 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
        orig_dict["weights_%d" % task] = w_b[:, task]
      else:
        # Dummy placeholders
        orig_dict["weights_%d" % task] = np.ones(
            (self.batch_size,)) 
        orig_dict["weights_%d" % task] = np.ones((self.batch_size,))
    return TensorflowGraph.get_feed_dict(orig_dict)

  def predict_proba_on_batch(self, X):
@@ -173,17 +172,15 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
        # transfer 2D prediction tensor to 2D x n_classes(=2) 
        complimentary = np.ones(np.shape(batch_outputs))
        complimentary = complimentary - batch_outputs
        batch_outputs = np.squeeze(np.stack(arrays = [complimentary,
						      batch_outputs],
                                            axis = 2))
        batch_outputs = np.concatenate(
            [complimentary, batch_outputs], axis=batch_outputs.ndim - 1)
        # reshape to batch_size x n_tasks x ...
        if batch_outputs.ndim == 3:
          batch_outputs = batch_outputs.transpose((1, 0, 2))
        elif batch_outputs.ndim == 2:
          batch_outputs = batch_outputs.transpose((1, 0))
        else:
          raise ValueError(
              'Unrecognized rank combination for output: %s ' %
          raise ValueError('Unrecognized rank combination for output: %s ' %
                           (batch_outputs.shape,))

      outputs = batch_outputs
@@ -211,21 +208,19 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
        # transfer 2D prediction tensor to 2D x n_classes(=2) 
        complimentary = np.ones(np.shape(batch_output))
        complimentary = complimentary - batch_output
        batch_output = np.squeeze(np.stack(arrays = [complimentary,
                                                     batch_output],
                                            axis = 2))
        batch_output = np.concatenate(
            [complimentary, batch_output], axis=batch_output.ndim - 1)
        # reshape to batch_size x n_tasks x ...
        if batch_output.ndim == 3:
          batch_output = batch_output.transpose((1, 0, 2))
        elif batch_output.ndim == 2:
          batch_output = batch_output.transpose((1, 0))
        else:
          raise ValueError(
              'Unrecognized rank combination for output: %s' %
          raise ValueError('Unrecognized rank combination for output: %s' %
                           (batch_output.shape,))
        output.append(batch_output)

        outputs = np.array(from_one_hot(
            np.squeeze(np.concatenate(output)), axis=-1))
        outputs = np.array(
            from_one_hot(np.squeeze(np.concatenate(output)), axis=-1))

    return np.copy(outputs)
+90 −11
Original line number Diff line number Diff line
@@ -8,8 +8,9 @@ from __future__ import unicode_literals
import os

import numpy as np

import time
import deepchem as dc
import tensorflow as tf
from deepchem.data import NumpyDataset


@@ -654,14 +655,46 @@ class IRVTransformer():
    """
    features = []
    similarity_xs = similarity * np.sign(w)
    for similarity_x in similarity_xs:
      pair = list(zip(similarity_x, range(len(similarity_x))))
      pair.sort(key=lambda x: x[0], reverse=True)
      if pair[0][0] >= 1:
        pair = pair[1:self.K + 1]
    [target_len, reference_len] = similarity_xs.shape
    g_temp = tf.Graph()
    values = []
    top_labels = []
    with g_temp.as_default():
      labels_tf = tf.constant(y)
      similarity_placeholder = tf.placeholder(
          dtype=tf.float64, shape=(None, reference_len))
      value, indice = tf.nn.top_k(
          similarity_placeholder, k=self.K + 1, sorted=True)
      # the tf graph here picks up the (K+1) highest similarity values 
      # and their indices
      top_label = tf.gather(labels_tf, indice)
      # map the indices to labels
      feed_dict = {}
      with tf.Session() as sess:
        for count in range(target_len // 100 + 1):
          feed_dict[similarity_placeholder] = similarity_xs[count * 100:min((
              count + 1) * 100, target_len), :]
          # generating batch of data by slicing similarity matrix 
          # into 100*reference_dataset_length
          fetched_values = sess.run([value, top_label], feed_dict=feed_dict)
          values.append(fetched_values[0])
          top_labels.append(fetched_values[1])
    values = np.concatenate(values, axis=0)
    top_labels = np.concatenate(top_labels, axis=0)
    # concatenate batches of data together
    for count in range(values.shape[0]):
      if values[count, 0] == 1:
        features.append(
            np.concatenate([
                values[count, 1:(self.K + 1)], top_labels[count, 1:(self.K + 1)]
            ]))
        # highest similarity is 1: target is in the reference
        # use the following K points
      else:
        pair = pair[:self.K]
      features.append([z[0] for z in pair] + [y[int(z[1])] for z in pair])
        features.append(
            np.concatenate(
                [values[count, 0:self.K], top_labels[count, 0:self.K]]))
        # highest less than 1: target not in the reference, use top K points
    return features

  def X_transform(self, X_target):
@@ -682,14 +715,60 @@ class IRVTransformer():
    """
    X_target2 = []
    n_features = X_target.shape[1]
    similarity = np.matmul(X_target, np.transpose(self.X)) / (
        n_features - np.matmul(1 - X_target, np.transpose(1 - self.X)))
    print('start similarity calculation')
    time1 = time.time()
    similarity = IRVTransformer.matrix_mul(X_target, np.transpose(self.X)) / (
        n_features - IRVTransformer.matrix_mul(1 - X_target,
                                               np.transpose(1 - self.X)))
    time2 = time.time()
    print('similarity calculation takes %i s' % (time2 - time1))
    for i in range(self.n_tasks):
      X_target2.append(self.realize(similarity, self.y[:, i], self.w[:, i]))
    return np.concatenate([z for z in np.array(X_target2)], axis=1)

  @staticmethod
  def matrix_mul(X1, X2, shard_size=5000):
    """ Calculate matrix multiplication for big matrix,
    X1 and X2 are sliced into pieces with shard_size rows(columns)
    then multiplied together and concatenated to the proper size
    """
    X1 = np.float_(X1)
    X2 = np.float_(X2)
    X1_shape = X1.shape
    X2_shape = X2.shape
    assert X1_shape[1] == X2_shape[0]
    X1_iter = X1_shape[0] // shard_size + 1
    X2_iter = X2_shape[1] // shard_size + 1
    all_result = np.zeros((1,))
    for X1_id in range(X1_iter):
      result = np.zeros((1,))
      for X2_id in range(X2_iter):
        partial_result = np.matmul(X1[X1_id * shard_size:min((
            X1_id + 1) * shard_size, X1_shape[0]), :],
                                   X2[:, X2_id * shard_size:min((
                                       X2_id + 1) * shard_size, X2_shape[1])])
        # calculate matrix multiplicatin on slices
        if result.size == 1:
          result = partial_result
        else:
          result = np.concatenate((result, partial_result), axis=1)
        # concatenate the slices together
        del partial_result
      if all_result.size == 1:
        all_result = result
      else:
        all_result = np.concatenate((all_result, result), axis=0)
      del result
    return all_result

  def transform(self, dataset):
    X_trans = self.X_transform(dataset.X)
    X_length = dataset.X.shape[0]
    X_trans = []
    for count in range(X_length // 5000 + 1):
      X_trans.append(
          self.X_transform(dataset.X[count * 5000:min((count + 1) * 5000,
                                                      X_length), :]))
    X_trans = np.concatenate(X_trans, axis=0)
    return NumpyDataset(X_trans, dataset.y, dataset.w, ids=None)

  def untransform(self, z):
+11 −7
Original line number Diff line number Diff line
@@ -12,7 +12,7 @@ Giving classification performances of:
    RobustMultitaskDNN(tf_robust),
    Logistic regression(logreg), IRV(irv)
    Graph convolution(graphconv)                 
on datasets: muv, pcba, tox21, sider, toxcast, clintox
on datasets: muv, pcba, tox21, sider, toxcast, clintox, hiv

Giving regression performances of:
    MultitaskDNN(tf_regression),
@@ -54,6 +54,7 @@ from chembl.chembl_datasets import load_chembl
from qm7.qm7_datasets import load_qm7_from_mat, load_qm7b_from_mat
from sampl.sampl_datasets import load_sampl
from clintox.clintox_datasets import load_clintox
from hiv.hiv_datasets import load_hiv


def benchmark_loading_datasets(hyper_parameters,
@@ -72,7 +73,7 @@ def benchmark_loading_datasets(hyper_parameters,
      hyper parameters including layer size, dropout, learning rate, etc.
  dataset: string, optional (default='tox21')
      choice of which dataset to use, should be: tox21, muv, sider, 
      toxcast, pcba, delaney, kaggle, nci, clintox, pdbbind, chembl,
      toxcast, pcba, delaney, kaggle, nci, clintox, hiv, pdbbind, chembl,
      qm7, qm7b, sampl
  model: string,  optional (default='tf')
      choice of which model to use, should be: rf, tf, tf_robust, logreg,
@@ -83,7 +84,7 @@ def benchmark_loading_datasets(hyper_parameters,
      path of result file
  """

  if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast', 'clintox']:
  if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast', 'clintox', 'hiv']:
    mode = 'classification'
  elif dataset in [
      'kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 'qm7', 'qm7b', 'sampl'
@@ -151,7 +152,8 @@ def benchmark_loading_datasets(hyper_parameters,
      'qm7': load_qm7_from_mat,
      'qm7b': load_qm7b_from_mat,
      'sampl': load_sampl,
      'clintox': load_clintox
      'clintox': load_clintox,
      'hiv': load_hiv
  }

  print('-------------------------------------')
@@ -786,7 +788,7 @@ if __name__ == '__main__':
      dest='dataset_args',
      default=[],
      help='Choice of dataset: tox21, sider, muv, toxcast, pcba, ' +
      'kaggle, delaney, nci, pdbbind, chembl, sampl, qm7, qm7b, clintox')
      'kaggle, delaney, nci, pdbbind, chembl, sampl, qm7, qm7b, clintox, hiv')
  parser.add_argument(
      '-t',
      action='store_true',
@@ -811,7 +813,7 @@ if __name__ == '__main__':
    #irv, rf, rf_regression should be assigned manually
  if len(datasets) == 0:
    datasets = [
        'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'sampl',
        'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'hiv', 'sampl',
        'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'qm7b'
    ]

@@ -898,7 +900,9 @@ if __name__ == '__main__':

  for split in splitters:
    for dataset in datasets:
      if dataset in ['tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox']:
      if dataset in [
          'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'hiv'
      ]:
        for model in models:
          if model in ['tf', 'tf_robust', 'logreg', 'graphconv', 'rf', 'irv']:
            benchmark_loading_datasets(

examples/hiv/HIV.csv

0 → 100644
+41914 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading