Commit 8903c2d8 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Modifications for revision

parent a963863b
Loading
Loading
Loading
Loading
+13 −15
Original line number Diff line number Diff line
@@ -16,8 +16,8 @@ K = 4
# Depth of attention module
max_depth = 3
# number positive/negative ligands
n_pos = 1
n_neg = 1
n_pos = 10
n_neg = 10
# Set batch sizes for network
test_batch_size = 128
support_batch_size = n_pos + n_neg
@@ -27,13 +27,13 @@ n_eval_trials = 20
learning_rate = 1e-4
log_every_n_samples = 50
# Number of features on conv-mols
n_feat = 71
n_feat = 75

muv_tasks, dataset, transformers = load_muv_convmol()

# Define metric
metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, verbosity="high", mode="classification")
    dc.metrics.roc_auc_score, mode="classification")

task_splitter = dc.splits.TaskSplitter()
fold_datasets = task_splitter.k_fold_split(dataset, K)
@@ -64,19 +64,17 @@ support_model.join(dc.nn.AttnLSTMEmbedding(
with tf.Session() as sess:
  model = dc.models.SupportGraphClassifier(
    sess, support_model, test_batch_size=test_batch_size,
    support_batch_size=support_batch_size, learning_rate=learning_rate,
    verbosity="high")
    support_batch_size=support_batch_size, learning_rate=learning_rate)

  ############################################################ DEBUG
  print("FIT")
  ############################################################ DEBUG
  model.fit(train_dataset, nb_epochs=nb_epochs, 
            n_episodes_per_epoch=n_train_trials,
            n_pos=n_pos, n_neg=n_neg, log_every_n_samples=log_every_n_samples)
  ############################################################ DEBUG
  print("EVAL")
  ############################################################ DEBUG
  scores = model.evaluate(
  mean_scores, std_scores = model.evaluate(
      test_dataset, metric, n_pos, n_neg, n_trials=n_eval_trials)
  print("Scores on evaluation dataset")
  print(scores)

print("Mean Scores on evaluation dataset")
print(mean_scores)
print("Standard Deviations on evaluation dataset")
print(std_scores)
print("Median of Mean Scores")
print(np.median(np.array(mean_scores.values())))
+94 −0
Original line number Diff line number Diff line
"""
Train low-data MUV models with graph-convolution. Test last fold only.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import tempfile
import numpy as np
import tensorflow as tf
import deepchem as dc
from datasets import load_muv_convmol

# 4-fold splits
K = 4
# num positive/negative ligands
n_pos = 10
n_neg = 10
# 20 trials on test-set
n_trials = 20
# Number of features on conv-mols
n_feat = 75
# Batch size of models
batch_size = 50

muv_tasks, dataset, transformers = load_muv_convmol()

# Define metric
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, mode="classification")

task_splitter = dc.splits.TaskSplitter()
fold_datasets = task_splitter.k_fold_split(dataset, K)

# The last fold is held out for testing. The merged training folds are
# built here but not referenced again in this script: every model below
# is fit on a support set drawn from the test fold only.
train_folds = fold_datasets[:-1]
train_dataset = dc.splits.merge_fold_datasets(train_folds)
test_dataset = fold_datasets[-1]

# Get supports on test-set
support_generator = dc.data.SupportGenerator(
    test_dataset, n_pos, n_neg, n_trials)

# Per-task list of scores, keyed by task index; one entry is appended for
# each (task, support) pair yielded by the generator.
n_test_tasks = len(test_dataset.get_task_names())
task_scores = {task: [] for task in range(n_test_tasks)}

for trial_num, (task, support) in enumerate(support_generator):
  print("Starting trial %d" % trial_num)
  # Build every trial's model in its own graph so that layers created for
  # earlier trials do not accumulate in a shared default graph.
  g = tf.Graph()

  with g.as_default():
    graph_model = dc.nn.SequentialGraph(n_feat)
    graph_model.add(dc.nn.GraphConv(64, activation='relu'))
    graph_model.add(dc.nn.GraphPool())
    graph_model.add(dc.nn.GraphConv(128, activation='relu'))
    graph_model.add(dc.nn.GraphPool())
    graph_model.add(dc.nn.GraphConv(64, activation='relu'))
    graph_model.add(dc.nn.GraphPool())
    graph_model.add(dc.nn.Dense(128, activation='tanh'))
    graph_model.add(dc.nn.GraphGather(batch_size, activation="tanh"))

    # Single session per trial, bound to `g` through the enclosing
    # as_default() and closed by the context manager on exit.
    with tf.Session() as sess:
      model = dc.models.MultitaskGraphClassifier(
        sess, graph_model, 1, batch_size=batch_size,
        learning_rate=1e-3, learning_rate_decay_time=1000,
        optimizer_type="adam", beta1=.9, beta2=.999)

      # Fit a fresh model on the support set only
      model.fit(support, nb_epoch=10)

      # Score on the task's remaining test datapoints (support removed)
      task_dataset = dc.data.get_task_dataset_minus_support(
          test_dataset, support, task)
      y_pred = model.predict_proba(task_dataset)
      score = metric.compute_metric(
          task_dataset.y, y_pred, task_dataset.w)
      print("Score on task %s is %s" % (str(task), str(score)))
      task_scores[task].append(score)

# Join information for all tasks.
mean_task_scores = {}
std_task_scores = {}
for task in range(n_test_tasks):
  mean_task_scores[task] = np.mean(np.array(task_scores[task]))
  std_task_scores[task] = np.std(np.array(task_scores[task]))

print("Mean scores")
print(mean_task_scores)
print("Standard Deviations")
print(std_task_scores)
print("Median of Mean Scores")
# list(...) is required on Python 3, where dict.values() is a view that
# np.array would otherwise wrap as a single 0-d object element.
print(np.median(np.array(list(mean_task_scores.values()))))
+13 −16
Original line number Diff line number Diff line
@@ -16,8 +16,8 @@ K = 4
# Depth of attention module
max_depth = 3
# num positive/negative ligands
n_pos = 1
n_neg = 1
n_pos = 10
n_neg = 10
# Set batch sizes for network
test_batch_size = 128
support_batch_size = n_pos + n_neg
@@ -27,13 +27,12 @@ n_eval_trials = 20
learning_rate = 1e-4
log_every_n_samples = 50
# Number of features on conv-mols
n_feat = 71
n_feat = 75

muv_tasks, dataset, transformers = load_muv_convmol()

# Define metric
metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, verbosity="high", mode="classification")
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, mode="classification")

task_splitter = dc.splits.TaskSplitter()
fold_datasets = task_splitter.k_fold_split(dataset, K)
@@ -64,19 +63,17 @@ support_model.join(dc.nn.ResiLSTMEmbedding(
with tf.Session() as sess:
  model = dc.models.SupportGraphClassifier(
    sess, support_model, test_batch_size=test_batch_size,
    support_batch_size=support_batch_size, learning_rate=learning_rate,
    verbosity="high")
    support_batch_size=support_batch_size, learning_rate=learning_rate)

  ############################################################ DEBUG
  print("FIT")
  ############################################################ DEBUG
  model.fit(train_dataset, nb_epochs=nb_epochs,
            n_episodes_per_epoch=n_train_trials,
            n_pos=n_pos, n_neg=n_neg, log_every_n_samples=log_every_n_samples)
  ############################################################ DEBUG
  print("EVAL")
  ############################################################ DEBUG
  scores = model.evaluate(
  mean_scores, std_scores = model.evaluate(
      test_dataset, metric, n_pos, n_neg, n_trials=n_eval_trials)
  print("Scores on evaluation dataset")
  print(scores)

print("Mean Scores on evaluation dataset")
print(mean_scores)
print("Standard Deviations on evaluation dataset")
print(std_scores)
print("Median of Mean Scores")
print(np.median(np.array(mean_scores.values())))
+10 −2
Original line number Diff line number Diff line
@@ -14,8 +14,8 @@ from sklearn.ensemble import RandomForestClassifier
# 4-fold splits
K = 4
# num positive/negative ligands
n_pos = 1
n_neg = 1
n_pos = 10
n_neg = 10
# 20 trials on test-set
n_trials = 20

@@ -55,6 +55,14 @@ for (task, support) in support_generator:

# Join information for all tasks.
mean_task_scores = {}
std_task_scores = {}
for task in range(len(test_dataset.get_task_names())):
  mean_task_scores[task] = np.mean(np.array(task_scores[task]))
  std_task_scores[task] = np.std(np.array(task_scores[task]))

print("Mean scores")
print(mean_task_scores)
print("Standard Deviations")
print(std_task_scores)
print("Median of Mean Scores")
print(np.median(np.array(mean_task_scores.values())))
+13 −16
Original line number Diff line number Diff line
@@ -13,8 +13,8 @@ from datasets import load_muv_convmol
# Number of folds for split 
K = 4 
# num positive/negative ligands
n_pos = 1
n_neg = 1
n_pos = 10
n_neg = 10
# Set batch sizes for network
test_batch_size = 128
support_batch_size = n_pos + n_neg
@@ -25,13 +25,12 @@ n_steps_per_trial = 1
learning_rate = 1e-4
log_every_n_samples = 50
# Number of features on conv-mols
n_feat = 71
n_feat = 75

muv_tasks, dataset, transformers = load_muv_convmol()

# Define metric
metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, verbosity="high", mode="classification")
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, mode="classification")

task_splitter = dc.splits.TaskSplitter()
fold_datasets = task_splitter.k_fold_split(dataset, K)
@@ -58,19 +57,17 @@ support_model.add_support(dc.nn.GraphGather(support_batch_size, activation='tanh
with tf.Session() as sess:
  model = dc.models.SupportGraphClassifier(
    sess, support_model, test_batch_size=test_batch_size,
    support_batch_size=support_batch_size, learning_rate=learning_rate,
    verbosity="high")
    support_batch_size=support_batch_size, learning_rate=learning_rate)

  ############################################################ DEBUG
  print("FIT")
  ############################################################ DEBUG
  model.fit(train_dataset, nb_epochs=nb_epochs,
            n_episodes_per_epoch=n_train_trials,
            n_pos=n_pos, n_neg=n_neg, log_every_n_samples=log_every_n_samples)
  ############################################################ DEBUG
  print("EVAL")
  ############################################################ DEBUG
  scores = model.evaluate(
  mean_scores, std_scores = model.evaluate(
      test_dataset, metric, n_pos, n_neg, n_trials=n_eval_trials)
  print("Scores on evaluation dataset")
  print(scores)

print("Mean Scores on evaluation dataset")
print(mean_scores)
print("Standard Deviations on evaluation dataset")
print(std_scores)
print("Median of Mean Scores")
print(np.median(np.array(mean_scores.values())))
Loading