Commit 17f082bb authored by Bharath Ramsundar
Browse files

Changes for experiments

parent 724a4fac
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -340,7 +340,7 @@ class SupportGraphClassifier(Model):
    feed_dict = self.construct_feed_dict(padded_test_batch, support)
    # Get scores
    pred, scores = self.sess.run([self.pred_op, self.scores_op], feed_dict=feed_dict)
    y_pred_batch = np.round(scores)
    y_pred_batch = np.round(pred)
    ########################################################### DEBUG
    # Remove padded elements
    y_pred_batch = y_pred_batch[:n_samples]
+66 −0
Original line number Diff line number Diff line
@@ -12,6 +12,10 @@ import tempfile
import numpy as np
import deepchem as dc

def to_numpy_dataset(dataset):
  """Builds a `dc.data.NumpyDataset` from an existing dataset.

  Reads the `X`, `y`, `w` and `ids` attributes of `dataset` (in that order)
  and forwards them positionally to the `NumpyDataset` constructor.
  """
  features, labels = dataset.X, dataset.y
  weights, identifiers = dataset.w, dataset.ids
  return dc.data.NumpyDataset(features, labels, weights, identifiers)

def load_tox21_ecfp(num_train=7200):
  """Load Tox21 datasets. Does not do train/test split"""
  # Set some global variables up top
@@ -128,3 +132,65 @@ def load_muv_convmol():
      dataset = transformer.transform(dataset)

  return MUV_tasks, dataset, transformers

def load_sider_ecfp():
  """Featurizes SIDER with circular fingerprints; no train/test split.

  The CSV at `../sider/sider.csv.gz` (relative to this file) is read once to
  discover the task names -- every column after the first -- and then
  featurized with a 1024-wide `CircularFingerprint`. A `BalancingTransformer`
  (with `transform_w=True`) is applied to the featurized dataset.

  Returns:
    Tuple `(SIDER_tasks, dataset, transformers)`: the list of task names, the
    transformed dataset, and the list of transformers that were applied.
  """
  print("About to featurize SIDER dataset.")
  dataset_file = os.path.join(
      os.path.dirname(os.path.realpath(__file__)), "../sider/sider.csv.gz")
  fingerprinter = dc.feat.CircularFingerprint(size=1024)

  # Peek at the raw table only to recover the task (column) names.
  # presumably load_from_disk returns a pandas DataFrame here -- confirm.
  raw_table = dc.utils.save.load_from_disk(dataset_file)
  SIDER_tasks = raw_table.columns.values.tolist()[1:]
  print("SIDER tasks: %s" % str(SIDER_tasks))
  print("%d tasks in total" % len(SIDER_tasks))

  loader_options = dict(
      tasks=SIDER_tasks,
      smiles_field="smiles",
      featurizer=fingerprinter,
      verbosity="high")
  featurized = dc.load.DataLoader(**loader_options).featurize(dataset_file)
  print("%d datapoints in SIDER dataset" % len(featurized))

  # Initialize transformers
  balancer = dc.trans.BalancingTransformer(
      transform_w=True, dataset=featurized)
  transformers = [balancer]
  print("About to transform data")
  for step in transformers:
    featurized = step.transform(featurized)

  return SIDER_tasks, featurized, transformers

def load_sider_convmol():
  """Featurizes SIDER for graph convolutions; no train/test split.

  The CSV at `../sider/sider.csv.gz` (relative to this file) is read once to
  discover the task names -- every column after the first -- and then
  featurized with `ConvMolFeaturizer`. A `BalancingTransformer` (with
  `transform_w=True`) is applied to the featurized dataset.

  Returns:
    Tuple `(SIDER_tasks, dataset, transformers)`: the list of task names, the
    transformed dataset, and the list of transformers that were applied.
  """
  print("About to featurize SIDER dataset.")
  dataset_file = os.path.join(
      os.path.dirname(os.path.realpath(__file__)), "../sider/sider.csv.gz")
  conv_featurizer = dc.feat.ConvMolFeaturizer()

  # Peek at the raw table only to recover the task (column) names.
  # presumably load_from_disk returns a pandas DataFrame here -- confirm.
  raw_table = dc.utils.save.load_from_disk(dataset_file)
  SIDER_tasks = raw_table.columns.values.tolist()[1:]
  print("SIDER tasks: %s" % str(SIDER_tasks))
  print("%d tasks in total" % len(SIDER_tasks))

  loader_options = dict(
      tasks=SIDER_tasks,
      smiles_field="smiles",
      featurizer=conv_featurizer,
      verbosity="high")
  # NOTE(review): debug=True is passed here but not in load_sider_ecfp;
  # confirm whether this is intentional or a leftover from debugging.
  featurized = dc.load.DataLoader(**loader_options).featurize(
      dataset_file, debug=True)
  print("%d datapoints in SIDER dataset" % len(featurized))

  # Initialize transformers
  balancer = dc.trans.BalancingTransformer(
      transform_w=True, dataset=featurized)
  transformers = [balancer]
  print("About to transform data")
  for step in transformers:
    featurized = step.transform(featurized)

  return SIDER_tasks, featurized, transformers
+1 −1
Original line number Diff line number Diff line
@@ -17,7 +17,7 @@ K = 4
max_depth = 3
# number positive/negative ligands
n_pos = 1
n_neg = 5
n_neg = 1
# Set batch sizes for network
test_batch_size = 128
support_batch_size = n_pos + n_neg
+83 −0
Original line number Diff line number Diff line
"""
Train low-data attn models on MUV. Test last fold only.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import tempfile
import numpy as np
import deepchem as dc
import tensorflow as tf
from datasets import load_muv_convmol

# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
# Number of folds for the task split
K = 4
# Depth of attention module
max_depth = 3
# Number of positive/negative ligands
n_pos, n_neg = 10, 10
# Batch sizes for the network; the support batch holds every support ligand
test_batch_size = 128
support_batch_size = n_pos + n_neg
# Training settings; nb_epochs, n_train_trials and log_every_n_samples are
# only referenced by the disabled fit() call further down.
nb_epochs = 1
n_train_trials = 2000
n_eval_trials = 20
learning_rate = 1e-4
log_every_n_samples = 50
# Number of features on conv-mols
n_feat = 71

# ---------------------------------------------------------------------------
# Data: split by task into K folds, hold out the last fold for testing
# ---------------------------------------------------------------------------
muv_tasks, dataset, transformers = load_muv_convmol()

# Metric passed to model.evaluate() below
metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, verbosity="high", mode="classification")

task_splitter = dc.splits.TaskSplitter()
fold_datasets = task_splitter.k_fold_split(dataset, K)

test_dataset = fold_datasets[-1]
train_folds = fold_datasets[:-1]
train_dataset = dc.splits.merge_fold_datasets(train_folds)

# ---------------------------------------------------------------------------
# Model: graph-conv stack shared by test and support inputs
# ---------------------------------------------------------------------------
support_model = dc.nn.SequentialSupportGraph(n_feat)

# Three GraphConv + GraphPool stages (64, 128, 64 outputs), then a dense layer
for _n_outputs in (64, 128, 64):
  support_model.add(dc.nn.GraphConv(_n_outputs, activation='relu'))
  support_model.add(dc.nn.GraphPool())
support_model.add(dc.nn.Dense(128, activation='tanh'))

# Separate gather layers for the test and support branches
support_model.add_test(
    dc.nn.GraphGather(test_batch_size, activation='tanh'))
support_model.add_support(
    dc.nn.GraphGather(support_batch_size, activation='tanh'))

# Join the two branches with an attention LSTM embedding
support_model.join(
    dc.nn.AttnLSTMEmbedding(test_batch_size, support_batch_size, max_depth))

# ---------------------------------------------------------------------------
# Evaluation (training is currently switched off)
# ---------------------------------------------------------------------------
with tf.Session() as sess:
  model = dc.models.SupportGraphClassifier(
      sess,
      support_model,
      test_batch_size=test_batch_size,
      support_batch_size=support_batch_size,
      learning_rate=learning_rate,
      verbosity="high")

  # DEBUG: training is turned off to investigate the model's behaviour.
  # Uncomment the lines below to re-enable it.
  #print("FIT")
  #model.fit(train_dataset, nb_epochs=nb_epochs,
  #          n_episodes_per_epoch=n_train_trials,
  #          n_pos=n_pos, n_neg=n_neg, log_every_n_samples=log_every_n_samples)

  print("EVAL")
  scores = model.evaluate(
      test_dataset, metric, n_pos, n_neg, n_trials=n_eval_trials)
  print("Scores on evaluation dataset")
  print(scores)
+1 −1
Original line number Diff line number Diff line
@@ -17,7 +17,7 @@ K = 4
max_depth = 3
# num positive/negative ligands
n_pos = 1
n_neg = 5
n_neg = 1
# Set batch sizes for network
test_batch_size = 128
support_batch_size = n_pos + n_neg
Loading