Commit ffd623ab authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Sider from tox

parent 17f082bb
Loading
Loading
Loading
Loading
+41 −0
Original line number Diff line number Diff line
@@ -9,6 +9,47 @@ import time
import numpy as np
from deepchem.data import NumpyDataset

def remove_dead_examples(dataset):
  """Removes compounds with no weight.

  A compound counts as dead when every one of its weights is exactly zero.
  Surviving rows keep their original relative order.

  Parameters
  ----------
  dataset: dc.data.Dataset
    Source dataset. Only its X, y, w and ids attributes are read.

  Returns
  -------
  NumpyDataset
    New dataset holding only the rows that have at least one nonzero weight.
  """
  w = np.asarray(dataset.w)

  # Test each weight against zero rather than summing the row: a sum can
  # cancel out (e.g. weights +1 and -1) and would wrongly drop a live row.
  live = w != 0
  if live.ndim > 1:
    # Collapse the task axes so there is exactly one flag per compound.
    # A 1-D weight vector already has one flag per compound.
    live = live.any(axis=tuple(range(1, live.ndim)))
  keep_inds = np.flatnonzero(live)

  # Keep only the rows that carry weight
  X = dataset.X[keep_inds]
  y = dataset.y[keep_inds]
  w = w[keep_inds]
  ids = dataset.ids[keep_inds]

  return NumpyDataset(X, y, w, ids)

def dataset_difference(dataset, remove):
  """Removes the compounds in remove from dataset.

  Compounds are matched by id: every row of dataset whose id also appears in
  remove.ids is dropped. Remaining rows keep their original relative order.

  Parameters
  ----------
  dataset: dc.data.Dataset
    Source dataset.
  remove: dc.data.Dataset
    Dataset whose overlap will be removed.

  Returns
  -------
  NumpyDataset
    New dataset with the rows of dataset whose ids are not in remove.
  """
  remove_ids = set(remove.ids)

  # Read the ids once instead of re-evaluating dataset.ids for every row.
  dataset_ids = dataset.ids
  keep_inds = np.asarray(
      [ind for ind, compound_id in enumerate(dataset_ids)
       if compound_id not in remove_ids],
      # Explicit integer dtype keeps an empty selection a valid index array.
      dtype=np.intp)

  # Keep only the rows that do not overlap with remove
  X = dataset.X[keep_inds]
  y = dataset.y[keep_inds]
  w = dataset.w[keep_inds]
  ids = dataset_ids[keep_inds]

  return NumpyDataset(X, y, w, ids)

def get_task_dataset_minus_support(dataset, support, task):
  """Gets data for specified task, minus support points.

+81 −0
Original line number Diff line number Diff line
@@ -19,6 +19,27 @@ class TestSupports(unittest.TestCase):
  Test that support generation happens properly.
  """

  def test_remove_dead_examples(self):
    """Tests that examples with zero weight are removed.

    Checks both how many rows survive and that the surviving rows are exactly
    the ones with nonzero weight, in their original order.
    """
    n_samples = 100
    n_features = 3
    n_tasks = 1

    # Generate dummy dataset
    np.random.seed(123)
    p = .05
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.binomial(1, p, size=(n_samples, n_tasks))
    w = np.random.binomial(1, p, size=(n_samples, n_tasks))

    # Rows expected to survive: those with any nonzero weight. The weights
    # here are 0/1, so a nonzero row sum is equivalent.
    live_rows = np.sum(w, axis=1) != 0
    num_nonzero = np.count_nonzero(live_rows)

    dataset = dc.data.NumpyDataset(X, y, w, ids)

    cleared_dataset = dc.data.remove_dead_examples(dataset)
    assert len(cleared_dataset) == num_nonzero

    # A length check alone would pass even if the wrong rows were kept.
    np.testing.assert_array_equal(cleared_dataset.X, X[live_rows])
    np.testing.assert_array_equal(cleared_dataset.y, y[live_rows])
    np.testing.assert_array_equal(cleared_dataset.w, w[live_rows])
    np.testing.assert_array_equal(cleared_dataset.ids, ids[live_rows])

  def test_get_task_support_simple(self):
    """Tests that get_task_support samples correctly."""
    n_samples = 20
@@ -181,6 +202,34 @@ class TestSupports(unittest.TestCase):
    np.testing.assert_array_equal(task_dataset.w, w[n_support:]) 
    np.testing.assert_array_equal(task_dataset.ids, ids[n_support:]) 

  def test_dataset_difference_simple(self):
    """Checks that a fixed leading slice of rows can be subtracted."""
    n_samples, n_remove = 20, 5
    n_features, n_tasks = 3, 1

    # Build the dummy data; the random draws happen in the order X, then y.
    np.random.seed(123)
    features = np.random.rand(n_samples, n_features)
    labels = np.random.randint(2, size=(n_samples, n_tasks))
    weights = np.ones((n_samples, n_tasks))
    identifiers = np.arange(n_samples)
    arrays = (features, labels, weights, identifiers)

    full = dc.data.NumpyDataset(*arrays)

    # The rows to subtract are the first n_remove rows of every array.
    to_remove = dc.data.NumpyDataset(*[arr[:n_remove] for arr in arrays])

    remaining = dc.data.dataset_difference(full, to_remove)

    # Exactly the trailing rows must be left, with contents untouched.
    assert len(remaining) == n_samples - n_remove
    observed = (remaining.X, remaining.y, remaining.w, remaining.ids)
    for got, source in zip(observed, arrays):
      np.testing.assert_array_equal(got, source[n_remove:])

  def test_get_task_minus_support(self):
    """Test that random index support can be removed from dataset."""
    n_samples = 10
@@ -212,6 +261,38 @@ class TestSupports(unittest.TestCase):
    np.testing.assert_array_equal(task_dataset.w, w[data_inds]) 
    np.testing.assert_array_equal(task_dataset.ids, ids[data_inds]) 

  def test_dataset_difference(self):
    """Checks that a random subset of rows can be subtracted."""
    n_samples, n_remove = 10, 4
    n_features, n_tasks = 3, 1

    # Build the dummy data; the random draws happen in the order X, then y,
    # then the indices to remove.
    np.random.seed(123)
    features = np.random.rand(n_samples, n_features)
    labels = np.random.randint(2, size=(n_samples, n_tasks))
    weights = np.ones((n_samples, n_tasks))
    identifiers = np.arange(n_samples)
    arrays = (features, labels, weights, identifiers)

    full = dc.data.NumpyDataset(*arrays)

    # Pick n_remove distinct row indices and put them in ascending order.
    picked = np.random.choice(
        np.arange(n_samples), (n_remove,), replace=False)
    remove_inds = sorted(picked)
    to_remove = dc.data.NumpyDataset(*[arr[remove_inds] for arr in arrays])

    remaining = dc.data.dataset_difference(full, to_remove)

    # Every row index that was not picked must survive, in ascending order.
    removed = set(remove_inds)
    data_inds = [ind for ind in range(n_samples) if ind not in removed]
    assert len(remaining) == n_samples - n_remove
    observed = (remaining.X, remaining.y, remaining.w, remaining.ids)
    for got, source in zip(observed, arrays):
      np.testing.assert_array_equal(got, source[data_inds])


  def test_get_task_minus_support_missing(self):
    """Test that support can be removed from dataset with missing data"""
    n_samples = 20
+80 −0
Original line number Diff line number Diff line
"""
Train low-data attn models on Tox21. Test on SIDER. Test last fold only.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import tempfile
import numpy as np
import deepchem as dc
import tensorflow as tf
from datasets import load_sider_convmol
from datasets import load_tox21_convmol
from datasets import to_numpy_dataset

# ---- Configuration ---------------------------------------------------------
# Number of folds for split
K = 4
# Depth of attention module
max_depth = 3
# Number of positive/negative ligands
n_pos = 10
n_neg = 10
# Batch sizes for network; the support batch holds all positives and negatives
test_batch_size = 128
support_batch_size = n_pos + n_neg
# Training / evaluation schedule
nb_epochs = 1
n_train_trials = 2000
n_eval_trials = 20
learning_rate = 1e-4
log_every_n_samples = 50
# Number of features on conv-mols
n_feat = 71

# ---- Data ------------------------------------------------------------------
# SIDER is the evaluation set, Tox21 the training set (see fit/evaluate below).
sider_tasks, sider_dataset, _ = load_sider_convmol()
sider_dataset = to_numpy_dataset(sider_dataset)
tox21_tasks, tox21_dataset, _ = load_tox21_convmol()
tox21_dataset = to_numpy_dataset(tox21_dataset)

# ---- Metric ----------------------------------------------------------------
metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, verbosity="high", mode="classification")

# ---- Model -----------------------------------------------------------------
support_model = dc.nn.SequentialSupportGraph(n_feat)

# Three graph-convolution stages (64, 128, 64 outputs), each followed by a
# pooling layer, then a dense layer.
for n_conv_outputs in (64, 128, 64):
  support_model.add(dc.nn.GraphConv(n_conv_outputs, activation='relu'))
  support_model.add(dc.nn.GraphPool())
support_model.add(dc.nn.Dense(128, activation='tanh'))

# Separate gather layers for the test and the support branch.
support_model.add_test(
    dc.nn.GraphGather(test_batch_size, activation='tanh'))
support_model.add_support(
    dc.nn.GraphGather(support_batch_size, activation='tanh'))

# Join both branches with an attention LSTM embedding.
attn_embedding = dc.nn.AttnLSTMEmbedding(
    test_batch_size, support_batch_size, max_depth)
support_model.join(attn_embedding)

# ---- Train on Tox21, evaluate on SIDER --------------------------------------
with tf.Session() as sess:
  model = dc.models.SupportGraphClassifier(
      sess,
      support_model,
      test_batch_size=test_batch_size,
      support_batch_size=support_batch_size,
      learning_rate=learning_rate,
      verbosity="high")

  print("FIT")
  model.fit(
      tox21_dataset,
      nb_epochs=nb_epochs,
      n_episodes_per_epoch=n_train_trials,
      n_pos=n_pos,
      n_neg=n_neg,
      log_every_n_samples=log_every_n_samples)

  print("EVAL")
  scores = model.evaluate(
      sider_dataset, metric, n_pos, n_neg, n_trials=n_eval_trials)
  print("Scores on evaluation dataset")
  print(scores)
+80 −0
Original line number Diff line number Diff line
"""
Train low-data res models on Tox21. Test on SIDER. Test last fold only.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import tempfile
import numpy as np
import deepchem as dc
import tensorflow as tf
from datasets import load_sider_convmol
from datasets import load_tox21_convmol
from datasets import to_numpy_dataset

# ---- Configuration ---------------------------------------------------------
# Number of folds for split
K = 4
# Depth passed to the LSTM embedding layer
max_depth = 3
# Number of positive/negative ligands
n_pos = 10
n_neg = 10
# Batch sizes for network; the support batch holds all positives and negatives
test_batch_size = 128
support_batch_size = n_pos + n_neg
# Training / evaluation schedule
nb_epochs = 1
n_train_trials = 2000
n_eval_trials = 20
learning_rate = 1e-4
log_every_n_samples = 50
# Number of features on conv-mols
n_feat = 71

# ---- Data ------------------------------------------------------------------
# SIDER is the evaluation set, Tox21 the training set (see fit/evaluate below).
sider_tasks, sider_dataset, _ = load_sider_convmol()
sider_dataset = to_numpy_dataset(sider_dataset)
tox21_tasks, tox21_dataset, _ = load_tox21_convmol()
tox21_dataset = to_numpy_dataset(tox21_dataset)

# ---- Metric ----------------------------------------------------------------
metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, verbosity="high", mode="classification")

# ---- Model -----------------------------------------------------------------
support_model = dc.nn.SequentialSupportGraph(n_feat)

# Three graph-convolution stages (64, 128, 64 outputs), each followed by a
# pooling layer, then a dense layer.
for n_conv_outputs in (64, 128, 64):
  support_model.add(dc.nn.GraphConv(n_conv_outputs, activation='relu'))
  support_model.add(dc.nn.GraphPool())
support_model.add(dc.nn.Dense(128, activation='tanh'))

# Separate gather layers for the test and the support branch.
support_model.add_test(
    dc.nn.GraphGather(test_batch_size, activation='tanh'))
support_model.add_support(
    dc.nn.GraphGather(support_batch_size, activation='tanh'))

# Join both branches with a residual LSTM embedding.
resi_embedding = dc.nn.ResiLSTMEmbedding(
    test_batch_size, support_batch_size, max_depth)
support_model.join(resi_embedding)

# ---- Train on Tox21, evaluate on SIDER --------------------------------------
with tf.Session() as sess:
  model = dc.models.SupportGraphClassifier(
      sess,
      support_model,
      test_batch_size=test_batch_size,
      support_batch_size=support_batch_size,
      learning_rate=learning_rate,
      verbosity="high")

  print("FIT")
  model.fit(
      tox21_dataset,
      nb_epochs=nb_epochs,
      n_episodes_per_epoch=n_train_trials,
      n_pos=n_pos,
      n_neg=n_neg,
      log_every_n_samples=log_every_n_samples)

  print("EVAL")
  scores = model.evaluate(
      sider_dataset, metric, n_pos, n_neg, n_trials=n_eval_trials)
  print("Scores on evaluation dataset")
  print(scores)
+74 −0
Original line number Diff line number Diff line
"""
Train low-data siamese models on Tox21. Test on SIDER. Test last fold only.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import numpy as np
import deepchem as dc
import tensorflow as tf
from datasets import load_sider_convmol
from datasets import load_tox21_convmol
from datasets import to_numpy_dataset

# ---- Configuration ---------------------------------------------------------
# Number of folds for split
K = 4
# Number of positive/negative ligands
n_pos = 10
n_neg = 10
# Batch sizes for network; the support batch holds all positives and negatives
test_batch_size = 128
support_batch_size = n_pos + n_neg
# Training / evaluation schedule
nb_epochs = 1
n_train_trials = 2000
n_eval_trials = 20
n_steps_per_trial = 1
learning_rate = 1e-4
log_every_n_samples = 50
# Number of features on conv-mols
n_feat = 71

# ---- Data ------------------------------------------------------------------
# SIDER is the evaluation set, Tox21 the training set (see fit/evaluate below).
sider_tasks, sider_dataset, _ = load_sider_convmol()
sider_dataset = to_numpy_dataset(sider_dataset)
tox21_tasks, tox21_dataset, _ = load_tox21_convmol()
tox21_dataset = to_numpy_dataset(tox21_dataset)

# ---- Metric ----------------------------------------------------------------
metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, verbosity="high", mode="classification")

# ---- Model -----------------------------------------------------------------
support_model = dc.nn.SequentialSupportGraph(n_feat)

# Three graph-convolution stages (64, 128, 64 outputs), each followed by a
# pooling layer, then a dense layer.
for n_conv_outputs in (64, 128, 64):
  support_model.add(dc.nn.GraphConv(n_conv_outputs, activation='relu'))
  support_model.add(dc.nn.GraphPool())
support_model.add(dc.nn.Dense(128, activation='tanh'))

# Separate gather layers for the test and the support branch; no joining
# embedding layer is added in this script.
support_model.add_test(
    dc.nn.GraphGather(test_batch_size, activation='tanh'))
support_model.add_support(
    dc.nn.GraphGather(support_batch_size, activation='tanh'))

# ---- Train on Tox21, evaluate on SIDER --------------------------------------
with tf.Session() as sess:
  model = dc.models.SupportGraphClassifier(
      sess,
      support_model,
      test_batch_size=test_batch_size,
      support_batch_size=support_batch_size,
      learning_rate=learning_rate,
      verbosity="high")

  print("FIT")
  model.fit(
      tox21_dataset,
      nb_epochs=nb_epochs,
      n_episodes_per_epoch=n_train_trials,
      n_pos=n_pos,
      n_neg=n_neg,
      log_every_n_samples=log_every_n_samples)

  print("EVAL")
  scores = model.evaluate(
      sider_dataset, metric, n_pos, n_neg, n_trials=n_eval_trials)
  print("Scores on evaluation dataset")
  print(scores)