Commit e99ed5a0 authored by miaecle's avatar miaecle
Browse files

style change

parent 5531c218
Loading
Loading
Loading
Loading
+48 −50
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ from deepchem.utils.save import log
from deepchem.data import pad_features
from deepchem.metrics import to_one_hot


def weight_decay(penalty_type, penalty):
  # weight (ndims=2) and bias (ndims=1) have different shapes,
  # so this version is used for logreg
@@ -40,6 +41,7 @@ def weight_decay(penalty_type, penalty):

class TensorflowLogisticRegression(TensorflowGraphModel):
  """ A simple tensorflow based logistic regression model. """

  def build(self, graph, name_scopes, training):
    """Constructs the graph architecture of model: n_tasks * sigmoid nodes.

@@ -47,15 +49,13 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
      mol_features: Molecule descriptor (e.g. fingerprint) tensor with shape
        batch_size x n_features.
    """
    placeholder_scope = TensorflowGraph.get_placeholder_scope(
        graph, name_scopes)
    placeholder_scope = TensorflowGraph.get_placeholder_scope(graph,
                                                              name_scopes)
    n_features = self.n_features
    with graph.as_default():
      with placeholder_scope:
        self.mol_features = tf.placeholder(
            tf.float32,
            shape=[None, n_features],
            name='mol_features')
            tf.float32, shape=[None, n_features], name='mol_features')

      weight_init_stddevs = self.weight_init_stddevs
      bias_init_consts = self.bias_init_consts
@@ -66,22 +66,22 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
            tensor=self.mol_features,
            size=1,
            weight_init=tf.truncated_normal(
                shape=[self.n_features, 1],
                stddev=weight_init_stddevs[0]),
            bias_init=tf.constant(value=bias_init_consts[0],
                                  shape=[1]))
                shape=[self.n_features, 1], stddev=weight_init_stddevs[0]),
            bias_init=tf.constant(value=bias_init_consts[0], shape=[1]))
        lg_list.append(lg)
    return lg_list

  def add_label_placeholders(self, graph, name_scopes):
    """Create one label placeholder per task.

    Every placeholder is float32 with shape [None, 1] (batch_size x 1), is
    named 'labels_<task>', and is wrapped in tf.identity before being
    collected.

    Args:
      graph: forwarded to TensorflowGraph.get_placeholder_scope.
      name_scopes: forwarded to TensorflowGraph.get_placeholder_scope.

    Returns:
      A list of self.n_tasks tensors, one per task, in task order.
    """
    placeholder_scope = TensorflowGraph.get_placeholder_scope(graph,
                                                              name_scopes)
    labels = []
    with placeholder_scope:
      for task in range(self.n_tasks):
        # Exactly one placeholder per task: creating it twice would register
        # a second, differently-named node and double the length of `labels`.
        placeholder = tf.placeholder(
            tf.float32, shape=[None, 1], name='labels_%d' % task)
        labels.append(tf.identity(placeholder))
    return labels

  def add_training_cost(self, graph, name_scopes, output, labels, weights):
@@ -93,8 +93,8 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
      with TensorflowGraph.shared_name_scope('costs', graph, name_scopes):
        for task in range(self.n_tasks):
          task_str = str(task).zfill(len(str(self.n_tasks)))
          with TensorflowGraph.shared_name_scope(
              'cost_{}'.format(task_str), graph, name_scopes):
          with TensorflowGraph.shared_name_scope('cost_{}'.format(task_str),
                                                 graph, name_scopes):
            with tf.name_scope('weighted'):
              weighted_cost = self.cost(output[task], labels[task],
                                        weights[task])
@@ -105,12 +105,13 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
              # non-zero weight examples in the batch.  Also, instead of using
              # tf.reduce_mean (which can put ops on the CPU) we explicitly
              # calculate with div/sum so it stays on the GPU.
              gradient_cost = tf.div(tf.reduce_sum(weighted_cost),
                                     self.batch_size)
              gradient_cost = tf.div(
                  tf.reduce_sum(weighted_cost), self.batch_size)
              gradient_costs.append(gradient_cost)

        # aggregated costs
        with TensorflowGraph.shared_name_scope('aggregated', graph, name_scopes):
        with TensorflowGraph.shared_name_scope('aggregated', graph,
                                               name_scopes):
          with tf.name_scope('gradient'):
            loss = tf.add_n(gradient_costs)

@@ -123,8 +124,8 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
      return loss

  def cost(self, logits, labels, weights):
    """Weighted sigmoid cross-entropy.

    Args:
      logits: first argument to tf.nn.sigmoid_cross_entropy_with_logits.
      labels: second argument to tf.nn.sigmoid_cross_entropy_with_logits.
      weights: tensor multiplied elementwise with the cross-entropy.

    Returns:
      tf.mul of the sigmoid cross-entropy of (logits, labels) and weights.
    """
    # Single return: a second, identical return after this one is unreachable.
    return tf.mul(
        tf.nn.sigmoid_cross_entropy_with_logits(logits, labels), weights)

  def add_output_ops(self, graph, output):
    # adding output nodes of sigmoid function
@@ -152,8 +153,7 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
        orig_dict["weights_%d" % task] = w_b[:, task]
      else:
        # Dummy placeholders
        orig_dict["weights_%d" % task] = np.ones(
            (self.batch_size,)) 
        orig_dict["weights_%d" % task] = np.ones((self.batch_size,))
    return TensorflowGraph.get_feed_dict(orig_dict)

  def predict_proba_on_batch(self, X):
@@ -172,16 +172,15 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
        # expand predictions p to n_classes(=2) along the last axis: [1 - p, p]
        complimentary = np.ones(np.shape(batch_outputs))
        complimentary = complimentary - batch_outputs
        batch_outputs = np.concatenate([complimentary, batch_outputs],
                                            axis = batch_outputs.ndim-1)
        batch_outputs = np.concatenate(
            [complimentary, batch_outputs], axis=batch_outputs.ndim - 1)
        # reshape to batch_size x n_tasks x ...
        if batch_outputs.ndim == 3:
          batch_outputs = batch_outputs.transpose((1, 0, 2))
        elif batch_outputs.ndim == 2:
          batch_outputs = batch_outputs.transpose((1, 0))
        else:
          raise ValueError(
              'Unrecognized rank combination for output: %s ' %
          raise ValueError('Unrecognized rank combination for output: %s ' %
                           (batch_outputs.shape,))

      outputs = batch_outputs
@@ -209,20 +208,19 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
        # expand predictions p to n_classes(=2) along the last axis: [1 - p, p]
        complimentary = np.ones(np.shape(batch_output))
        complimentary = complimentary - batch_output
        batch_output = np.concatenate([complimentary, batch_output],
                                            axis = batch_output.ndim-1)
        batch_output = np.concatenate(
            [complimentary, batch_output], axis=batch_output.ndim - 1)
        # reshape to batch_size x n_tasks x ...
        if batch_output.ndim == 3:
          batch_output = batch_output.transpose((1, 0, 2))
        elif batch_output.ndim == 2:
          batch_output = batch_output.transpose((1, 0))
        else:
          raise ValueError(
              'Unrecognized rank combination for output: %s' %
          raise ValueError('Unrecognized rank combination for output: %s' %
                           (batch_output.shape,))
        output.append(batch_output)

        outputs = np.array(from_one_hot(
            np.squeeze(np.concatenate(output)), axis=-1))
        outputs = np.array(
            from_one_hot(np.squeeze(np.concatenate(output)), axis=-1))

    return np.copy(outputs)
+29 −16
Original line number Diff line number Diff line
@@ -661,12 +661,15 @@ class IRVTransformer():
    top_labels = []
    with g_temp.as_default():
      labels_tf = tf.constant(y)
      similarity_placeholder = tf.placeholder(dtype=tf.float64, shape=(None,reference_len))
      value, indice = tf.nn.top_k(similarity_placeholder, k=self.K+1, sorted=True)
      similarity_placeholder = tf.placeholder(
          dtype=tf.float64, shape=(None, reference_len))
      value, indice = tf.nn.top_k(
          similarity_placeholder, k=self.K + 1, sorted=True)
      top_label = tf.gather(labels_tf, indice)
      feed_dict = {}
      for count in range(target_len // 100 + 1):
        feed_dict[similarity_placeholder] = similarity_xs[count*100:min((count+1)*100, target_len),:]
        feed_dict[similarity_placeholder] = similarity_xs[count * 100:min((
            count + 1) * 100, target_len), :]
        with tf.Session() as sess:
          fetched_values = sess.run([value, top_label], feed_dict=feed_dict)
          values.append(fetched_values[0])
@@ -675,9 +678,14 @@ class IRVTransformer():
    top_labels = np.concatenate(top_labels, axis=0)
    for count in range(values.shape[0]):
      if values[count, 0] == 1:
        features.append(np.concatenate([values[count, 1:(self.K+1)], top_labels[count, 1:(self.K+1)]]))
        features.append(
            np.concatenate([
                values[count, 1:(self.K + 1)], top_labels[count, 1:(self.K + 1)]
            ]))
      else:
        features.append(np.concatenate([values[count, 0:self.K], top_labels[count, 0:self.K]]))
        features.append(
            np.concatenate(
                [values[count, 0:self.K], top_labels[count, 0:self.K]]))
    return features

  def X_transform(self, X_target):
@@ -701,7 +709,8 @@ class IRVTransformer():
    print('start similarity calculation')
    time1 = time.time()
    similarity = IRVTransformer.matrix_mul(X_target, np.transpose(self.X)) / (
        n_features - IRVTransformer.matrix_mul(1 - X_target, np.transpose(1 - self.X)))
        n_features - IRVTransformer.matrix_mul(1 - X_target,
                                               np.transpose(1 - self.X)))
    time2 = time.time()
    print('similarity calculation takes %i s' % (time2 - time1))
    for i in range(self.n_tasks):
@@ -721,8 +730,10 @@ class IRVTransformer():
    for X1_id in range(X1_iter):
      result = np.zeros((1,))
      for X2_id in range(X2_iter):
        partial_result = np.matmul(X1[X1_id*shard_size:min((X1_id+1)*shard_size, X1_shape[0]),:],
                                   X2[:, X2_id*shard_size:min((X2_id+1)*shard_size, X2_shape[1])])
        partial_result = np.matmul(X1[X1_id * shard_size:min((
            X1_id + 1) * shard_size, X1_shape[0]), :],
                                   X2[:, X2_id * shard_size:min((
                                       X2_id + 1) * shard_size, X2_shape[1])])
        if result.size == 1:
          result = partial_result
        else:
@@ -739,7 +750,9 @@ class IRVTransformer():
    X_length = dataset.X.shape[0]
    X_trans = []
    for count in range(X_length // 5000 + 1):
      X_trans.append(self.X_transform(dataset.X[count*5000:min((count+1)*5000,X_length), :]))
      X_trans.append(
          self.X_transform(dataset.X[count * 5000:min((count + 1) * 5000,
                                                      X_length), :]))
    X_trans = np.concatenate(X_trans, axis=0)
    return NumpyDataset(X_trans, dataset.y, dataset.w, ids=None)

+6 −3
Original line number Diff line number Diff line
@@ -56,6 +56,7 @@ from sampl.sampl_datasets import load_sampl
from clintox.clintox_datasets import load_clintox
from hiv.hiv_datasets import load_hiv


def benchmark_loading_datasets(hyper_parameters,
                               dataset='tox21',
                               model='tf',
@@ -812,8 +813,8 @@ if __name__ == '__main__':
    #irv, rf, rf_regression should be assigned manually
  if len(datasets) == 0:
    datasets = [
        'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'hiv', 
        'sampl', 'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'qm7b'
        'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'hiv', 'sampl',
        'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'qm7b'
    ]

  #input hyperparameters
@@ -899,7 +900,9 @@ if __name__ == '__main__':

  for split in splitters:
    for dataset in datasets:
      if dataset in ['tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'hiv']:
      if dataset in [
          'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'hiv'
      ]:
        for model in models:
          if model in ['tf', 'tf_robust', 'logreg', 'graphconv', 'rf', 'irv']:
            benchmark_loading_datasets(
+11 −8
Original line number Diff line number Diff line
@@ -10,13 +10,13 @@ import numpy as np
import shutil
import deepchem as dc


def load_hiv(featurizer='ECFP', split='index'):
  """Load hiv dataset and split it into train/valid/test sets."""
  # Featurize hiv dataset
  print("About to featurize hiv dataset.")
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(
      current_dir, "./HIV.csv")
  dataset_file = os.path.join(current_dir, "./HIV.csv")
  hiv_tasks = ["HIV_active"]
  if featurizer == 'ECFP':
    featurizer_func = dc.feat.CircularFingerprint(size=1024)
@@ -27,16 +27,19 @@ def load_hiv(featurizer='ECFP', split='index'):
  dataset = loader.featurize(dataset_file, shard_size=8192)
  # Initialize transformers 
  transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  ]

  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  splitters = {'index': dc.splits.IndexSplitter(),
  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'scaffold': dc.splits.ScaffoldSplitter(),
               'butina': dc.splits.ButinaSplitter()}
      'butina': dc.splits.ButinaSplitter()
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  return hiv_tasks, (train, valid, test), transformers
+1 −4
Original line number Diff line number Diff line
@@ -25,10 +25,7 @@ train_dataset = transformer.transform(train_dataset)
valid_dataset = transformer.transform(valid_dataset)

model = dc.models.TensorflowMultiTaskIRVClassifier(
        len(hiv_tasks),
        K=10,
        batch_size=50,
        learning_rate=0.001)
    len(hiv_tasks), K=10, batch_size=50, learning_rate=0.001)

# Fit trained model
model.fit(train_dataset)
Loading