Commit 0c092b03 authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #277 from rbharath/support_models

Updated support models for new generator API
parents 87851610 07b5932a
Loading
Loading
Loading
Loading
+17 −32
Original line number Diff line number Diff line
@@ -14,18 +14,18 @@ from datasets import load_tox21_convmol
# Number of folds for split 
K = 4 
# Depth of attention module
max_depth = 8
max_depth = 3
# number positive/negative ligands
n_pos = 1 
n_pos = 10
n_neg = 10
# Set batch sizes for network
test_batch_size = 128
support_batch_size = n_pos + n_neg
n_train_trials = 5000
nb_epochs = 1
n_train_trials = 2000
n_eval_trials = 20
n_steps_per_trial = 1
# Sample supports without replacement (all pos/neg should be different)
replace = False
learning_rate = 1e-4
log_every_n_samples = 50
# Number of features on conv-mols
n_feat = 71

@@ -46,29 +46,17 @@ test_dataset = fold_datasets[-1]
support_model = dc.nn.SequentialSupportGraph(n_feat)

# Add layers
# Need to add batch-norm separately to test/support due to differing
# shapes.
# Adding 1st layer 
# output will be (n_atoms, 64)
support_model.add(dc.nn.GraphConv(64, activation='relu'))
support_model.add_test(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
support_model.add_support(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
support_model.add(dc.nn.GraphPool())
# Adding 2nd layer
# output will be (n_atoms, 64)
support_model.add(dc.nn.GraphConv(64, activation='relu'))
support_model.add_test(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
support_model.add_support(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
support_model.add(dc.nn.GraphConv(128, activation='relu'))
support_model.add(dc.nn.GraphPool())
# Adding 3rd layer
support_model.add(dc.nn.GraphConv(64, activation='relu'))
support_model.add_support(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
support_model.add_test(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
support_model.add(dc.nn.GraphPool())
support_model.add(dc.nn.Dense(128, activation='tanh'))

support_model.add_test(dc.nn.GraphGather(test_batch_size, activation='tanh'))
support_model.add_support(dc.nn.GraphGather(support_batch_size, activation='tanh'))

# Gather into molecules
support_model.add_test(dc.nn.GraphGather(test_batch_size))
support_model.add_support(dc.nn.GraphGather(support_batch_size))
# Apply an attention lstm layer
support_model.join(dc.nn.AttnLSTMEmbedding(
    test_batch_size, support_batch_size, max_depth))
@@ -76,22 +64,19 @@ support_model.join(dc.nn.AttnLSTMEmbedding(
with tf.Session() as sess:
  model = dc.models.SupportGraphClassifier(
    sess, support_model, test_batch_size=test_batch_size,
    support_batch_size=support_batch_size,
    learning_rate=1e-3, verbosity="high")
    support_batch_size=support_batch_size, learning_rate=learning_rate,
    verbosity="high")

  ############################################################ DEBUG
  print("FIT")
  ############################################################ DEBUG
  model.fit(train_dataset, n_trials=n_train_trials,
            n_steps_per_trial=n_steps_per_trial, n_pos=n_pos,
            n_neg=n_neg, replace=False)
  model.save()

  model.fit(train_dataset, nb_epochs=nb_epochs, 
            n_episodes_per_epoch=n_train_trials,
            n_pos=n_pos, n_neg=n_neg, log_every_n_samples=log_every_n_samples)
  ############################################################ DEBUG
  print("EVAL")
  ############################################################ DEBUG
  scores = model.evaluate(
      test_dataset, metric, n_pos=n_pos, n_neg=n_neg, replace=replace,
      n_trials=n_eval_trials)
      test_dataset, metric, n_pos, n_neg, n_trials=n_eval_trials)
  print("Scores on evaluation dataset")
  print(scores)
+22 −35
Original line number Diff line number Diff line
"""
Train low-data attn models on random forests. Test last fold only.
Train low-data res models on Tox21. Test last fold only.
"""
from __future__ import print_function
from __future__ import division
@@ -12,21 +12,20 @@ import tensorflow as tf
from datasets import load_tox21_convmol

# Number of folds for split 
K = 12 
K = 4 
# Depth of residual LSTM embedding module
max_depth = 4
max_depth = 3
# num positive/negative ligands
n_pos = 1 
n_pos = 10
n_neg = 10
# Set batch sizes for network
test_batch_size = 128
support_batch_size = n_pos + n_neg
n_train_trials = 3000 
n_eval_trials = 5
n_steps_per_trial = 1
nb_epochs = 1
n_train_trials = 2000
n_eval_trials = 20
learning_rate = 1e-4
log_every_n_samples = 50
# Sample supports without replacement (all pos/neg should be different)
replace = False
# Number of features on conv-mols
n_feat = 71

@@ -47,49 +46,37 @@ test_dataset = fold_datasets[-1]
support_model = dc.nn.SequentialSupportGraph(n_feat)

# Add layers
# 1st conv layer + batchnorm
# output will be (n_atoms, 64)
support_model.add(dc.nn.GraphConv(64, activation='relu'))
support_model.add_test(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
support_model.add_support(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
# 2nd conv layer + batchnorm
# output will be (n_atoms, 64)
support_model.add(dc.nn.GraphConv(64, activation='relu'))
support_model.add_test(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
support_model.add_support(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
# 3rd conv layer + batchnorm
# output will be (n_atoms, 64)
support_model.add(dc.nn.GraphPool())
support_model.add(dc.nn.GraphConv(128, activation='relu'))
support_model.add(dc.nn.GraphPool())
support_model.add(dc.nn.GraphConv(64, activation='relu'))
support_model.add_test(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
support_model.add_support(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))

support_model.add(dc.nn.GraphPool())
support_model.add_test(dc.nn.GraphGather(test_batch_size))
support_model.add_support(dc.nn.GraphGather(support_batch_size))
support_model.add(dc.nn.Dense(128, activation='tanh'))

support_model.add_test(dc.nn.GraphGather(test_batch_size, activation='tanh'))
support_model.add_support(dc.nn.GraphGather(support_batch_size, activation='tanh'))

# Apply a residual lstm layer
support_model.join(dc.nn.ResiLSTMEmbedding(
    test_batch_size, support_batch_size, max_depth))


with tf.Session() as sess:
  model = dc.models.SupportGraphClassifier(
    sess, support_model, test_batch_size=test_batch_size,
    support_batch_size=support_batch_size,
    learning_rate=1e-3, verbosity="high")
    support_batch_size=support_batch_size, learning_rate=learning_rate,
    verbosity="high")

  ############################################################ DEBUG
  print("FIT")
  ############################################################ DEBUG
  model.old_fit(test_dataset, n_trials=n_train_trials,
            n_steps_per_trial=n_steps_per_trial, n_pos=n_pos,
            n_neg=n_neg, log_every_n_samples=log_every_n_samples, replace=False)
  model.save()

  model.fit(train_dataset, nb_epochs=nb_epochs,
            n_episodes_per_epoch=n_train_trials,
            n_pos=n_pos, n_neg=n_neg, log_every_n_samples=log_every_n_samples)
  ############################################################ DEBUG
  print("EVAL")
  ############################################################ DEBUG
  scores = model.evaluate(
      test_dataset, metric, n_pos=n_pos, n_neg=n_neg, replace=replace,
      n_trials=n_eval_trials)
      test_dataset, metric, n_pos, n_neg, n_trials=n_eval_trials)
  print("Scores on evaluation dataset")
  print(scores)
+5 −8
Original line number Diff line number Diff line
@@ -14,12 +14,10 @@ from sklearn.ensemble import RandomForestClassifier
# 4-fold splits
K = 4
# num positive/negative ligands
n_pos = 1
n_pos = 5
n_neg = 10
# Number of trials on test-set
n_trials = 10
# Sample supports without replacement (all pos/neg should be different)
replace = False
n_trials = 20

tox21_tasks, dataset, transformers = load_tox21_ecfp()

@@ -35,15 +33,14 @@ test_dataset = fold_datasets[-1]

# Get supports on test-set
support_generator = dc.data.SupportGenerator(
    test_dataset, range(len(test_dataset.get_task_names())), n_pos, n_neg,
    n_trials, replace)
    test_dataset, n_pos, n_neg, n_trials)

# Compute accuracies
task_scores = {task: [] for task in range(len(test_dataset.get_task_names()))}
for (task, support) in support_generator:
  # Train model on support
  sklearn_model = RandomForestClassifier(
      class_weight="balanced", n_estimators=50)
      class_weight="balanced", n_estimators=100)
  model = dc.models.SklearnModel(sklearn_model)
  model.fit(support)

@@ -53,7 +50,7 @@ for (task, support) in support_generator:
  y_pred = model.predict_proba(task_dataset)
  score = metric.compute_metric(
      task_dataset.y, y_pred, task_dataset.w)
  #print("Score on task %s is %s" % (str(task), str(score)))
  print("Score on task %s is %s" % (str(task), str(score)))
  task_scores[task].append(score)

# Join information for all tasks.
+19 −38
Original line number Diff line number Diff line
@@ -14,16 +14,17 @@ from datasets import load_tox21_convmol
# Number of folds for split 
K = 4 
# num positive/negative ligands
n_pos = 3
n_pos = 10
n_neg = 10
# Set batch sizes for network
test_batch_size = 100
test_batch_size = 128
support_batch_size = n_pos + n_neg
n_train_trials = 3000
nb_epochs = 1
n_train_trials = 2000
n_eval_trials = 20 
n_steps_per_trial = 1
# Sample supports without replacement (all pos/neg should be different)
replace = False
learning_rate = 1e-4
log_every_n_samples = 50
# Number of features on conv-mols
n_feat = 71

@@ -44,53 +45,33 @@ test_dataset = fold_datasets[-1]
support_model = dc.nn.SequentialSupportGraph(n_feat)

# Add layers

# Adding 1st layer
# output will be (n_atoms, 64)
support_model.add(dc.nn.GraphConv(64, activation='relu'))
# Need to add batch-norm to test/support due to differing shapes.
# output will be (n_atoms, 64)
support_model.add_test(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
support_model.add_support(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
# Adding 2nd layer
# output will be (n_atoms, 64)
support_model.add(dc.nn.GraphConv(64, activation='relu'))
support_model.add(dc.nn.GraphPool())
# Adding 3rd layer
support_model.add(dc.nn.GraphConv(64, activation='relu'))
support_model.add_support(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
support_model.add_test(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
# Adding 4th layer
support_model.add(dc.nn.GraphConv(128, activation='relu'))
support_model.add(dc.nn.GraphPool())
support_model.add(dc.nn.GraphConv(64, activation='relu'))
support_model.add_support(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
support_model.add_test(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
## Adding 5th layer
#support_model.add(dc.nn.GraphConv(64, activation='relu'))
#support_model.add_support(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
#support_model.add_test(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
support_model.add(dc.nn.GraphPool())
support_model.add(dc.nn.Dense(128, activation='tanh'))

# Gather atoms into batches
support_model.add_test(dc.nn.GraphGather(test_batch_size))
support_model.add_support(dc.nn.GraphGather(support_batch_size))
support_model.add_test(dc.nn.GraphGather(test_batch_size, activation='tanh'))
support_model.add_support(dc.nn.GraphGather(support_batch_size, activation='tanh'))

with tf.Session() as sess:
  model = dc.models.SupportGraphClassifier(
    sess, support_model, test_batch_size=test_batch_size,
    support_batch_size=support_batch_size, learning_rate=3e-3, verbosity="high")
    support_batch_size=support_batch_size, learning_rate=learning_rate,
    verbosity="high")

  ############################################################ DEBUG
  print("FIT")
  ############################################################ DEBUG
  model.fit(train_dataset, n_trials=n_train_trials,
            n_steps_per_trial=n_steps_per_trial, n_pos=n_pos,
            n_neg=n_neg, replace=False)
  model.save()

  model.fit(train_dataset, nb_epochs=nb_epochs,
            n_episodes_per_epoch=n_train_trials,
            n_pos=n_pos, n_neg=n_neg, log_every_n_samples=log_every_n_samples)
  ############################################################ DEBUG
  print("EVAL")
  ############################################################ DEBUG
  scores = model.evaluate(
      test_dataset, metric, n_pos=n_pos, n_neg=n_neg, replace=replace,
      n_trials=n_eval_trials)
  print("Scores on held-out dataset")
      test_dataset, metric, n_pos, n_neg, n_trials=n_eval_trials)
  print("Scores on evaluation dataset")
  print(scores)