Commit fc37366f authored by miaecle's avatar miaecle
Browse files

add in hiv

parent f639f956
Loading
Loading
Loading
Loading
+40 −12
Original line number Diff line number Diff line
@@ -8,8 +8,9 @@ from __future__ import unicode_literals
import os

import numpy as np

import time
import deepchem as dc
import tensorflow as tf
from deepchem.data import NumpyDataset


@@ -654,14 +655,29 @@ class IRVTransformer():
    """
    features = []
    similarity_xs = similarity * np.sign(w)
    for similarity_x in similarity_xs:
      pair = list(zip(similarity_x, range(len(similarity_x))))
      pair.sort(key=lambda x: x[0], reverse=True)
      if pair[0][0] >= 1:
        pair = pair[1:self.K + 1]
    [target_len, reference_len] = similarity_xs.shape
    g_temp = tf.Graph()
    values = []
    top_labels = []
    with g_temp.as_default():
      labels_tf = tf.constant(y)
      similarity_placeholder = tf.placeholder(dtype=tf.float64, shape=(None,reference_len))
      value, indice = tf.nn.top_k(similarity_placeholder, k=self.K+1, sorted=True)
      top_label = tf.gather(labels_tf, indice)
      feed_dict = {}
      for count in range(target_len//100+1):
        feed_dict[similarity_placeholder] = similarity_xs[count*100:min((count+1)*100, target_len),:]
        with tf.Session() as sess:
          fetched_values = sess.run([value, top_label], feed_dict=feed_dict)
          values.append(fetched_values[0])
          top_labels.append(fetched_values[1])
    values = np.concatenate(values, axis=0)
    top_labels = np.concatenate(top_labels, axis=0)
    for count in range(values.shape[0]):
      if values[count,0] == 1:
        features.append(np.concatenate([values[count, 1:(self.K+1)], top_labels[count, 1:(self.K+1)]]))
      else:
        pair = pair[:self.K]
      features.append([z[0] for z in pair] + [y[int(z[1])] for z in pair])
        features.append(np.concatenate([values[count, 0:self.K], top_labels[count, 0:self.K]]))
    return features

  def X_transform(self, X_target):
@@ -682,14 +698,20 @@ class IRVTransformer():
    """
    X_target2 = []
    n_features = X_target.shape[1]
    similarity = matrix_mul(X_target, np.transpose(self.X)) / (
        n_features - matrix_mul(1 - X_target, np.transpose(1 - self.X)))
    print('start similarity calculation')
    time1 = time.time()
    similarity = IRVTransformer.matrix_mul(X_target, np.transpose(self.X)) / (
        n_features - IRVTransformer.matrix_mul(1 - X_target, np.transpose(1 - self.X)))
    time2 = time.time()
    print('similarity calculation takes %i s' % (time2-time1))
    for i in range(self.n_tasks):
      X_target2.append(self.realize(similarity, self.y[:, i], self.w[:, i]))
    return np.concatenate([z for z in np.array(X_target2)], axis=1)

  @staticmethod
  def matrix_mul(X1, X2, shard_size=1000):
  def matrix_mul(X1, X2, shard_size=5000):
    X1 = np.float_(X1)
    X2 = np.float_(X2)
    X1_shape = X1.shape
    X2_shape = X2.shape
    assert X1_shape[1] == X2_shape[0]
@@ -705,14 +727,20 @@ class IRVTransformer():
          result = partial_result
        else:
          result = np.concatenate((result, partial_result), axis=1)
        del partial_result
      if all_result.size == 1:
        all_result = result
      else:
        all_result = np.concatenate((all_result, result), axis=0)
      del result
    return all_result    

  def transform(self, dataset):
    """Apply the IRV feature transformation to a dataset, chunk by chunk.

    The feature matrix is pushed through `X_transform` in row chunks so the
    similarity computation never has to handle the whole dataset at once.

    Parameters
    ----------
    dataset: object exposing `X`, `y` and `w` attributes
        dataset whose feature matrix `X` is to be transformed

    Returns
    -------
    NumpyDataset
        holds the transformed features together with the input's `y` and
        `w`; ids are not carried over (`ids=None`).
    """
    chunk_size = 5000
    # Read the feature matrix once instead of on every loop iteration.
    X = dataset.X
    n_samples = X.shape[0]
    # Step by chunk start so that a sample count that is an exact multiple of
    # chunk_size does not produce an extra, empty trailing chunk. max(..., 1)
    # keeps a single (empty) chunk for an empty dataset, as before.
    starts = range(0, max(n_samples, 1), chunk_size)
    # Slices that run past the end are clipped, so no explicit min() needed.
    X_trans = np.concatenate(
        [self.X_transform(X[start:start + chunk_size, :]) for start in starts],
        axis=0)
    return NumpyDataset(X_trans, dataset.y, dataset.w, ids=None)

  def untransform(self, z):
+10 −9
Original line number Diff line number Diff line
@@ -12,7 +12,7 @@ Giving classification performances of:
    RobustMultitaskDNN(tf_robust),
    Logistic regression(logreg), IRV(irv)
    Graph convolution(graphconv)                 
on datasets: muv, pcba, tox21, sider, toxcast, clintox
on datasets: muv, pcba, tox21, sider, toxcast, clintox, hiv

Giving regression performances of:
    MultitaskDNN(tf_regression),
@@ -54,7 +54,7 @@ from chembl.chembl_datasets import load_chembl
from qm7.qm7_datasets import load_qm7_from_mat, load_qm7b_from_mat
from sampl.sampl_datasets import load_sampl
from clintox.clintox_datasets import load_clintox

from hiv.hiv_datasets import load_hiv

def benchmark_loading_datasets(hyper_parameters,
                               dataset='tox21',
@@ -72,7 +72,7 @@ def benchmark_loading_datasets(hyper_parameters,
      hyper parameters including layer size, dropout, learning rate, etc.
  dataset: string, optional (default='tox21')
      choice of which dataset to use, should be: tox21, muv, sider, 
      toxcast, pcba, delaney, kaggle, nci, clintox, pdbbind, chembl,
      toxcast, pcba, delaney, kaggle, nci, clintox, hiv, pdbbind, chembl,
      qm7, qm7b, sampl
  model: string,  optional (default='tf')
      choice of which model to use, should be: rf, tf, tf_robust, logreg,
@@ -83,7 +83,7 @@ def benchmark_loading_datasets(hyper_parameters,
      path of result file
  """

  if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast', 'clintox']:
  if dataset in ['muv', 'pcba', 'tox21', 'sider', 'toxcast', 'clintox', 'hiv']:
    mode = 'classification'
  elif dataset in [
      'kaggle', 'delaney', 'nci', 'pdbbind', 'chembl', 'qm7', 'qm7b', 'sampl'
@@ -151,7 +151,8 @@ def benchmark_loading_datasets(hyper_parameters,
      'qm7': load_qm7_from_mat,
      'qm7b': load_qm7b_from_mat,
      'sampl': load_sampl,
      'clintox': load_clintox
      'clintox': load_clintox,
      'hiv': load_hiv
  }

  print('-------------------------------------')
@@ -786,7 +787,7 @@ if __name__ == '__main__':
      dest='dataset_args',
      default=[],
      help='Choice of dataset: tox21, sider, muv, toxcast, pcba, ' +
      'kaggle, delaney, nci, pdbbind, chembl, sampl, qm7, qm7b, clintox')
      'kaggle, delaney, nci, pdbbind, chembl, sampl, qm7, qm7b, clintox, hiv')
  parser.add_argument(
      '-t',
      action='store_true',
@@ -811,8 +812,8 @@ if __name__ == '__main__':
    #irv, rf, rf_regression should be assigned manually
  if len(datasets) == 0:
    datasets = [
        'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'sampl',
        'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'qm7b'
        'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'hiv', 
        'sampl', 'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'qm7b'
    ]

  #input hyperparameters
@@ -898,7 +899,7 @@ if __name__ == '__main__':

  for split in splitters:
    for dataset in datasets:
      if dataset in ['tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox']:
      if dataset in ['tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'hiv']:
        for model in models:
          if model in ['tf', 'tf_robust', 'logreg', 'graphconv', 'rf', 'irv']:
            benchmark_loading_datasets(

examples/hiv/HIV.csv

0 → 100644
+41914 −0

File added.

Preview size limit exceeded, changes collapsed.

+0 −0

Empty file added.

+42 −0
Original line number Diff line number Diff line
"""
hiv dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import shutil
import deepchem as dc

def load_hiv(featurizer='ECFP', split='index'):
  """Load, featurize, balance and split the hiv dataset.

  Reads `HIV.csv` from the directory containing this module, featurizes the
  molecules in its `smiles` column for the single `HIV_active` task, applies
  a BalancingTransformer and splits the result into train/valid/test sets.

  Parameters
  ----------
  featurizer: string, optional (default='ECFP')
      choice of featurization, should be: ECFP (circular fingerprint of
      size 1024) or GraphConv (ConvMolFeaturizer)
  split: string, optional (default='index')
      choice of splitter, should be: index, random, scaffold, butina

  Returns
  -------
  tuple
      (hiv_tasks, (train, valid, test), transformers)

  Raises
  ------
  ValueError
      if `featurizer` or `split` is not one of the supported choices
  """
  # Validate both options up front so that a typo fails immediately with a
  # clear message instead of after the whole CSV has been featurized.
  supported_featurizers = ('ECFP', 'GraphConv')
  supported_splits = ('index', 'random', 'scaffold', 'butina')
  if featurizer not in supported_featurizers:
    raise ValueError("Unknown featurizer %r; expected one of: %s" %
                     (featurizer, ', '.join(supported_featurizers)))
  if split not in supported_splits:
    raise ValueError("Unknown split %r; expected one of: %s" %
                     (split, ', '.join(supported_splits)))

  # Featurize hiv dataset
  print("About to featurize hiv dataset.")
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(
      current_dir, "./HIV.csv")
  hiv_tasks = ["HIV_active"]
  if featurizer == 'ECFP':
    featurizer_func = dc.feat.CircularFingerprint(size=1024)
  else:  # 'GraphConv', guaranteed by the validation above
    featurizer_func = dc.feat.ConvMolFeaturizer()
  loader = dc.data.CSVLoader(
      tasks=hiv_tasks, smiles_field="smiles", featurizer=featurizer_func)
  dataset = loader.featurize(dataset_file, shard_size=8192)
  # Initialize transformers
  transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]

  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  splitters = {'index': dc.splits.IndexSplitter(),
               'random': dc.splits.RandomSplitter(),
               'scaffold': dc.splits.ScaffoldSplitter(),
               'butina': dc.splits.ButinaSplitter()}
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  return hiv_tasks, (train, valid, test), transformers
Loading