Commit 96c7ab7b authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #278 from rbharath/muv_models

Support Models for MUV dataset
parents 0c092b03 bf9b19a2
Loading
Loading
Loading
Loading
+11 −5
Original line number Diff line number Diff line
@@ -58,6 +58,8 @@ class TaskSplitter(Splitter):
                             frac_test=.1):
    """Performs a train/valid/test split of the tasks for dataset.

    If split is uneven, spillover goes to test.

    Parameters
    ----------
    dataset: dc.data.Dataset
@@ -69,12 +71,11 @@ class TaskSplitter(Splitter):
    frac_test: float, optional
      Proportion of tasks to be put into test. Rounded to nearest int.
    """
    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1)
    n_tasks = len(dataset.get_task_names())
    n_train = int(np.round(frac_train * n_tasks))
    n_valid = int(np.round(frac_valid * n_tasks))
    n_test = int(np.round(frac_test * n_tasks))
    if n_train + n_valid + n_test != n_tasks:
      raise ValueError("Train/Valid/Test fractions don't split tasks evenly.")
    n_test = n_tasks - n_train - n_valid 

    X, y, w, ids = dataset.X, dataset.y, dataset.w, dataset.ids
    
@@ -88,6 +89,8 @@ class TaskSplitter(Splitter):
  def k_fold_split(self, dataset, K):
    """Performs a K-fold split of the tasks for dataset.

    If split is uneven, spillover goes to last fold.

    Parameters
    ----------
    dataset: dc.data.Dataset
@@ -98,13 +101,16 @@ class TaskSplitter(Splitter):
    n_tasks = len(dataset.get_task_names())
    n_per_fold = int(np.round(n_tasks/float(K)))
    if K * n_per_fold != n_tasks:
      raise ValueError("Cannot perform a valid %d-way split" % K)
      print("Assigning extra tasks to last fold due to uneven split")
    
    X, y, w, ids = dataset.X, dataset.y, dataset.w, dataset.ids

    fold_datasets = []
    for fold in range(K):
      if fold != K-1:
        fold_tasks = range(fold*n_per_fold, (fold+1)*n_per_fold)
      else:
        fold_tasks = range(fold*n_per_fold, n_tasks)
      fold_datasets.append(
          NumpyDataset(X, y[:, fold_tasks], w[:, fold_tasks], ids))
    return fold_datasets
+42 −0
Original line number Diff line number Diff line
@@ -58,6 +58,48 @@ class TestTaskSplitters(unittest.TestCase):
    for fold_dataset in fold_datasets:
      assert len(fold_dataset.get_task_names()) == 2

  def test_uneven_k_fold_split(self):
    """Check k_fold_split when K does not evenly divide the task count.

    A random dataset with 17 tasks is split into K = 4 folds. Each of the
    first K - 1 folds is expected to hold 4 tasks, and the last fold is
    expected to hold 5.
    """
    num_folds = 4
    n_samples, n_features, n_tasks = 100, 10, 17
    active_fraction = .05  # proportion actives
    features = np.random.rand(n_samples, n_features)
    labels = np.random.binomial(
        1, active_fraction, size=(n_samples, n_tasks))
    dataset = dc.data.NumpyDataset(features, labels)
    folds = dc.splits.TaskSplitter().k_fold_split(dataset, num_folds)

    # Every fold except the final one gets the regular share of tasks.
    for leading_fold in folds[:num_folds - 1]:
      assert len(leading_fold.get_task_names()) == 4
    # The final fold also holds the task left over from the uneven split.
    assert len(folds[-1].get_task_names()) == 5


  def test_uneven_train_valid_test_split(self):
    """Check the task-wise train/valid/test split with uneven proportions.

    A random dataset with 11 tasks is split with fractions .4/.3/.3. Train
    and valid are expected to receive 4 and 3 tasks, and test is expected to
    receive 4 (its own share plus the extra task).
    """
    n_samples, n_features, n_tasks = 100, 10, 11
    active_fraction = .05  # proportion actives
    features = np.random.rand(n_samples, n_features)
    labels = np.random.binomial(
        1, active_fraction, size=(n_samples, n_tasks))
    dataset = dc.data.NumpyDataset(features, labels)

    splitter = dc.splits.TaskSplitter()
    train, valid, test = splitter.train_valid_test_split(
        dataset, frac_train=.4, frac_valid=.3, frac_test=.3)

    # Checked in train, valid, test order; the extra task goes to test.
    expected_counts = ((train, 4), (valid, 3), (test, 4))
    for subset, n_expected in expected_counts:
      assert len(subset.get_task_names()) == n_expected

  def test_merge_fold_datasets(self):
    """
    Test that (K-1) folds can be merged into train dataset.
+60 −7
Original line number Diff line number Diff line
@@ -12,13 +12,9 @@ import tempfile
import numpy as np
import deepchem as dc

#sys.path.append("..")
#from muv.muv_datasets import load_muv

def load_tox21_ecfp(num_train=7200):
  """Load Tox21 datasets. Does not do train/test split"""
  # Set some global variables up top
  verbosity = "high"
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(
      current_dir, "../../datasets/tox21.csv.gz")
@@ -31,7 +27,7 @@ def load_tox21_ecfp(num_train=7200):

  loader = dc.load.DataLoader(
      tasks=tox21_tasks, smiles_field="smiles", featurizer=featurizer,
      verbosity=verbosity)
      verbosity="high")
  dataset = loader.featurize(
      dataset_file, shard_size=8192)

@@ -48,7 +44,6 @@ def load_tox21_ecfp(num_train=7200):
def load_tox21_convmol(base_dir=None, num_train=7200):
  """Load Tox21 datasets. Does not do train/test split"""
  # Set some global variables up top
  verbosity = "high"
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(
      current_dir, "../../datasets/tox21.csv.gz")
@@ -62,7 +57,7 @@ def load_tox21_convmol(base_dir=None, num_train=7200):

  loader = dc.load.DataLoader(
      tasks=tox21_tasks, smiles_field="smiles",
      featurizer=featurizer, verbosity=verbosity)
      featurizer=featurizer, verbosity="high")
  dataset = loader.featurize(
      dataset_file, shard_size=8192)

@@ -75,3 +70,61 @@ def load_tox21_convmol(base_dir=None, num_train=7200):
    dataset = transformer.transform(dataset)

  return tox21_tasks, dataset, transformers

def load_muv_ecfp():
  """Load the MUV dataset featurized with circular fingerprints.

  Reads ``../../datasets/muv.csv.gz`` relative to this file, featurizes it
  with ``CircularFingerprint(size=1024)`` and applies a
  ``BalancingTransformer`` built from the featurized dataset. Does not do a
  train/test split.

  Returns
  -------
  tuple
    ``(tasks, dataset, transformers)``: the sorted MUV task names, the
    transformed dataset, and the list of transformers that were applied.
  """
  print("About to load MUV dataset.")
  here = os.path.dirname(os.path.realpath(__file__))
  muv_csv = os.path.join(here, "../../datasets/muv.csv.gz")

  print("About to featurize MUV dataset.")
  fingerprint = dc.feat.CircularFingerprint(size=1024)
  task_names = sorted([
      'MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644', 'MUV-548',
      'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712', 'MUV-737', 'MUV-858',
      'MUV-713', 'MUV-733', 'MUV-652', 'MUV-466', 'MUV-832'])
  muv_loader = dc.load.DataLoader(
      tasks=task_names, smiles_field="smiles",
      featurizer=fingerprint, verbosity="high")
  data = muv_loader.featurize(muv_csv)

  # The transformer is constructed from the untransformed dataset, then
  # applied to it.
  applied = [dc.trans.BalancingTransformer(transform_w=True, dataset=data)]
  print("About to transform data")
  for step in applied:
    data = step.transform(data)

  return task_names, data, applied

def load_muv_convmol():
  """Load the MUV dataset featurized for graph-convolution models.

  Reads ``../../datasets/muv.csv.gz`` relative to this file, featurizes it
  with ``ConvMolFeaturizer`` and applies a ``BalancingTransformer`` built
  from the featurized dataset. Does not do a train/test split.

  Returns
  -------
  tuple
    ``(tasks, dataset, transformers)``: the sorted MUV task names, the
    transformed dataset, and the list of transformers that were applied.
  """
  print("About to load MUV dataset.")
  here = os.path.dirname(os.path.realpath(__file__))
  muv_csv = os.path.join(here, "../../datasets/muv.csv.gz")

  print("About to featurize MUV dataset.")
  conv_featurizer = dc.feat.ConvMolFeaturizer()
  task_names = sorted([
      'MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644', 'MUV-548',
      'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712', 'MUV-737', 'MUV-858',
      'MUV-713', 'MUV-733', 'MUV-652', 'MUV-466', 'MUV-832'])
  muv_loader = dc.load.DataLoader(
      tasks=task_names, smiles_field="smiles",
      featurizer=conv_featurizer, verbosity="high")
  data = muv_loader.featurize(muv_csv)

  # The transformer is constructed from the untransformed dataset, then
  # applied to it.
  applied = [dc.trans.BalancingTransformer(transform_w=True, dataset=data)]
  print("About to transform data")
  for step in applied:
    data = step.transform(data)

  return task_names, data, applied
+82 −0
Original line number Diff line number Diff line
"""
Train low-data attn models on MUV. Test last fold only.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import tempfile
import numpy as np
import deepchem as dc
import tensorflow as tf
from datasets import load_muv_convmol

# ---- Experiment configuration ----
K = 4  # number of folds for the task split
max_depth = 3  # depth handed to the attention LSTM embedding
n_pos, n_neg = 1, 5  # number of positive / negative ligands
# Batch sizes for the network; the support batch holds n_pos + n_neg ligands.
test_batch_size = 128
support_batch_size = n_pos + n_neg
nb_epochs = 1
n_train_trials = 2000
n_eval_trials = 20
learning_rate = 1e-4
log_every_n_samples = 50
n_feat = 71  # number of features on conv-mols

muv_tasks, dataset, transformers = load_muv_convmol()

# ROC-AUC in classification mode is the evaluation metric.
metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, verbosity="high", mode="classification")

# Split by task: all folds but the last are merged for training, and the
# last fold is kept for evaluation.
task_splitter = dc.splits.TaskSplitter()
fold_datasets = task_splitter.k_fold_split(dataset, K)
train_folds = fold_datasets[:-1]
test_dataset = fold_datasets[-1]
train_dataset = dc.splits.merge_fold_datasets(train_folds)

# ---- Support model definition ----
support_model = dc.nn.SequentialSupportGraph(n_feat)

# Three graph-conv / graph-pool pairs followed by a dense layer.
for conv_width in (64, 128, 64):
  support_model.add(dc.nn.GraphConv(conv_width, activation='relu'))
  support_model.add(dc.nn.GraphPool())
support_model.add(dc.nn.Dense(128, activation='tanh'))

# Separate gather layers for the test and support branches.
support_model.add_test(
    dc.nn.GraphGather(test_batch_size, activation='tanh'))
support_model.add_support(
    dc.nn.GraphGather(support_batch_size, activation='tanh'))

# Join both branches with an attention LSTM embedding layer.
support_model.join(
    dc.nn.AttnLSTMEmbedding(test_batch_size, support_batch_size, max_depth))

# ---- Training and evaluation ----
with tf.Session() as sess:
  model = dc.models.SupportGraphClassifier(
      sess, support_model,
      test_batch_size=test_batch_size,
      support_batch_size=support_batch_size,
      learning_rate=learning_rate,
      verbosity="high")

  # Progress marker printed before training starts.
  print("FIT")
  model.fit(
      train_dataset,
      nb_epochs=nb_epochs,
      n_episodes_per_epoch=n_train_trials,
      n_pos=n_pos,
      n_neg=n_neg,
      log_every_n_samples=log_every_n_samples)

  # Progress marker printed before evaluation starts.
  print("EVAL")
  scores = model.evaluate(
      test_dataset, metric, n_pos, n_neg, n_trials=n_eval_trials)
  print("Scores on evaluation dataset")
  print(scores)
+82 −0
Original line number Diff line number Diff line
"""
Train low-data res models on MUV. Test last fold only.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import tempfile
import numpy as np
import deepchem as dc
import tensorflow as tf
from datasets import load_muv_convmol

# ---- Experiment configuration ----
K = 4  # number of folds for the task split
max_depth = 3  # depth handed to the residual LSTM embedding
n_pos, n_neg = 1, 5  # number of positive / negative ligands
# Batch sizes for the network; the support batch holds n_pos + n_neg ligands.
test_batch_size = 128
support_batch_size = n_pos + n_neg
nb_epochs = 1
n_train_trials = 2000
n_eval_trials = 20
learning_rate = 1e-4
log_every_n_samples = 50
n_feat = 71  # number of features on conv-mols

muv_tasks, dataset, transformers = load_muv_convmol()

# ROC-AUC in classification mode is the evaluation metric.
metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, verbosity="high", mode="classification")

# Split by task: all folds but the last are merged for training, and the
# last fold is kept for evaluation.
task_splitter = dc.splits.TaskSplitter()
fold_datasets = task_splitter.k_fold_split(dataset, K)
train_folds = fold_datasets[:-1]
test_dataset = fold_datasets[-1]
train_dataset = dc.splits.merge_fold_datasets(train_folds)

# ---- Support model definition ----
support_model = dc.nn.SequentialSupportGraph(n_feat)

# Three graph-conv / graph-pool pairs followed by a dense layer.
for conv_width in (64, 128, 64):
  support_model.add(dc.nn.GraphConv(conv_width, activation='relu'))
  support_model.add(dc.nn.GraphPool())
support_model.add(dc.nn.Dense(128, activation='tanh'))

# Separate gather layers for the test and support branches.
support_model.add_test(
    dc.nn.GraphGather(test_batch_size, activation='tanh'))
support_model.add_support(
    dc.nn.GraphGather(support_batch_size, activation='tanh'))

# Join both branches with a residual LSTM embedding layer.
support_model.join(
    dc.nn.ResiLSTMEmbedding(test_batch_size, support_batch_size, max_depth))

# ---- Training and evaluation ----
with tf.Session() as sess:
  model = dc.models.SupportGraphClassifier(
      sess, support_model,
      test_batch_size=test_batch_size,
      support_batch_size=support_batch_size,
      learning_rate=learning_rate,
      verbosity="high")

  # Progress marker printed before training starts.
  print("FIT")
  model.fit(
      train_dataset,
      nb_epochs=nb_epochs,
      n_episodes_per_epoch=n_train_trials,
      n_pos=n_pos,
      n_neg=n_neg,
      log_every_n_samples=log_every_n_samples)

  # Progress marker printed before evaluation starts.
  print("EVAL")
  scores = model.evaluate(
      test_dataset, metric, n_pos, n_neg, n_trials=n_eval_trials)
  print("Scores on evaluation dataset")
  print(scores)
Loading