Commit da5be2a8 authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #283 from rbharath/experiments

[WIP] Changes for experiments
parents b667a504 b2a84b17
Loading
Loading
Loading
Loading
+41 −0
Original line number Diff line number Diff line
@@ -9,6 +9,47 @@ import time
import numpy as np
from deepchem.data import NumpyDataset

def remove_dead_examples(dataset):
  """Drops every example whose weights sum to zero across axis 1.

  Parameters
  ----------
  dataset: dc.data.Dataset
    Source dataset.

  Returns
  -------
  NumpyDataset
    New dataset built from the rows of X, y, w and ids whose summed
    weight is nonzero.
  """
  # Per-example total weight; a total of zero marks the example as dead.
  weight_totals = np.sum(dataset.w, axis=1)
  live_rows = np.nonzero(weight_totals)

  # Apply the same row selection to each of the four parallel arrays.
  kept = [arr[live_rows]
          for arr in (dataset.X, dataset.y, dataset.w, dataset.ids)]
  return NumpyDataset(*kept)

def dataset_difference(dataset, remove):
  """Removes the compounds in remove from dataset.

  Examples are matched by id: any row of dataset whose id also appears in
  remove.ids is dropped, and the remaining rows keep their original order.

  Parameters
  ----------
  dataset: dc.data.Dataset
    Source dataset.
  remove: dc.data.Dataset
    Dataset whose overlap will be removed.

  Returns
  -------
  NumpyDataset
    New dataset holding the rows of X, y, w and ids from dataset whose id
    is not present in remove.
  """
  remove_ids = set(remove.ids)
  # Read the id array once instead of on every loop iteration.
  # NOTE(review): for dataset types where `ids` is a computed property this
  # presumably avoids repeated loads -- confirm against the Dataset classes.
  dataset_ids = dataset.ids
  keep_inds = [ind for ind, identifier in enumerate(dataset_ids)
               if identifier not in remove_ids]

  # Keep only the rows whose ids survived the filter.
  X = dataset.X[keep_inds]
  y = dataset.y[keep_inds]
  w = dataset.w[keep_inds]
  ids = dataset_ids[keep_inds]

  return NumpyDataset(X, y, w, ids)

def get_task_dataset_minus_support(dataset, support, task):
  """Gets data for specified task, minus support points.

+81 −0
Original line number Diff line number Diff line
@@ -19,6 +19,27 @@ class TestSupports(unittest.TestCase):
  Test that support generation happens properly.
  """

  def test_remove_dead_examples(self):
    """Checks that rows whose weights sum to zero are dropped."""
    n_samples, n_features, n_tasks = 100, 3, 1
    p = .05

    # Seeded dummy data; X, y, w are drawn in this fixed order so the
    # generated values stay reproducible.
    np.random.seed(123)
    X = np.random.rand(n_samples, n_features)
    shape = (n_samples, n_tasks)
    y = np.random.binomial(1, p, size=shape)
    w = np.random.binomial(1, p, size=shape)
    ids = np.arange(n_samples)

    # Number of rows that carry any weight, computed independently.
    expected_len = np.count_nonzero(np.sum(w, axis=1))

    dataset = dc.data.NumpyDataset(X, y, w, ids)
    cleared_dataset = dc.data.remove_dead_examples(dataset)

    assert len(cleared_dataset) == expected_len

  def test_get_task_support_simple(self):
    """Tests that get_task_support samples correctly."""
    n_samples = 20
@@ -181,6 +202,34 @@ class TestSupports(unittest.TestCase):
    np.testing.assert_array_equal(task_dataset.w, w[n_support:]) 
    np.testing.assert_array_equal(task_dataset.ids, ids[n_support:]) 

  def test_dataset_difference_simple(self):
    """Checks that a leading slice of examples can be removed."""
    n_samples, n_remove = 20, 5
    n_features, n_tasks = 3, 1

    # Seeded dummy data; X is drawn before y so values are reproducible.
    np.random.seed(123)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(2, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    ids = np.arange(n_samples)
    dataset = dc.data.NumpyDataset(X, y, w, ids)

    # The dataset to subtract is simply the first n_remove examples.
    head = slice(None, n_remove)
    remove_dataset = dc.data.NumpyDataset(X[head], y[head], w[head], ids[head])

    out_dataset = dc.data.dataset_difference(dataset, remove_dataset)

    # Everything after the removed head must survive, in order.
    tail = slice(n_remove, None)
    assert len(out_dataset) == n_samples - n_remove
    for actual, original in ((out_dataset.X, X), (out_dataset.y, y),
                             (out_dataset.w, w), (out_dataset.ids, ids)):
      np.testing.assert_array_equal(actual, original[tail])

  def test_get_task_minus_support(self):
    """Test that random index support can be removed from dataset."""
    n_samples = 10
@@ -212,6 +261,38 @@ class TestSupports(unittest.TestCase):
    np.testing.assert_array_equal(task_dataset.w, w[data_inds]) 
    np.testing.assert_array_equal(task_dataset.ids, ids[data_inds]) 

  def test_dataset_difference(self):
    """Checks that randomly chosen examples can be removed."""
    n_samples, n_remove = 10, 4
    n_features, n_tasks = 3, 1

    # Seeded dummy data; the draw order (X, y, then the removal indices)
    # is fixed so the generated values stay reproducible.
    np.random.seed(123)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(2, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    ids = np.arange(n_samples)
    dataset = dc.data.NumpyDataset(X, y, w, ids)

    # Pick n_remove distinct row indices and build the dataset to subtract.
    chosen = np.random.choice(
        np.arange(n_samples), (n_remove,), replace=False)
    remove_inds = sorted(chosen)
    remove_dataset = dc.data.NumpyDataset(
        X[remove_inds], y[remove_inds], w[remove_inds], ids[remove_inds])

    out_dataset = dc.data.dataset_difference(dataset, remove_dataset)

    # The surviving rows are exactly the indices that were not chosen.
    data_inds = sorted(set(range(n_samples)).difference(remove_inds))
    assert len(out_dataset) == n_samples - n_remove
    for actual, original in ((out_dataset.X, X), (out_dataset.y, y),
                             (out_dataset.w, w), (out_dataset.ids, ids)):
      np.testing.assert_array_equal(actual, original[data_inds])


  def test_get_task_minus_support_missing(self):
    """Test that support can be removed from dataset with missing data"""
    n_samples = 20
+1 −1
Original line number Diff line number Diff line
@@ -340,7 +340,7 @@ class SupportGraphClassifier(Model):
    feed_dict = self.construct_feed_dict(padded_test_batch, support)
    # Get scores
    pred, scores = self.sess.run([self.pred_op, self.scores_op], feed_dict=feed_dict)
    y_pred_batch = np.round(scores)
    y_pred_batch = np.round(pred)
    ########################################################### DEBUG
    # Remove padded elements
    y_pred_batch = y_pred_batch[:n_samples]
+66 −0
Original line number Diff line number Diff line
@@ -12,6 +12,10 @@ import tempfile
import numpy as np
import deepchem as dc

def to_numpy_dataset(dataset):
  """Builds a dc.data.NumpyDataset from a dataset's X, y, w and ids."""
  X, y, w, ids = dataset.X, dataset.y, dataset.w, dataset.ids
  return dc.data.NumpyDataset(X, y, w, ids)

def load_tox21_ecfp(num_train=7200):
  """Load Tox21 datasets. Does not do train/test split"""
  # Set some global variables up top
@@ -128,3 +132,65 @@ def load_muv_convmol():
      dataset = transformer.transform(dataset)

  return MUV_tasks, dataset, transformers

def load_sider_ecfp():
  """Loads SIDER featurized with circular fingerprints; no train/test split.

  Returns
  -------
  tuple
    (SIDER_tasks, dataset, transformers): the task names read from the csv
    columns after the first, the featurized dataset after all transformers
    were applied, and the list of transformers used.
  """
  print("About to featurize SIDER dataset.")
  # The csv is located relative to this module's own directory.
  module_dir = os.path.dirname(os.path.realpath(__file__))
  sider_csv = os.path.join(module_dir, "../sider/sider.csv.gz")
  featurizer = dc.feat.CircularFingerprint(size=1024)

  # Every column after the first is treated as a task name.
  raw_table = dc.utils.save.load_from_disk(sider_csv)
  SIDER_tasks = raw_table.columns.values[1:].tolist()
  print("SIDER tasks: %s" % str(SIDER_tasks))
  print("%d tasks in total" % len(SIDER_tasks))

  loader = dc.load.DataLoader(
      tasks=SIDER_tasks,
      smiles_field="smiles",
      featurizer=featurizer,
      verbosity="high")
  dataset = loader.featurize(sider_csv)
  print("%d datapoints in SIDER dataset" % len(dataset))

  # Transformers are constructed from the untransformed dataset and then
  # applied in list order.
  transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]
  print("About to transform data")
  for step in transformers:
    dataset = step.transform(dataset)

  return SIDER_tasks, dataset, transformers

def load_sider_convmol():
  """Loads SIDER featurized with ConvMolFeaturizer; no train/test split.

  Returns
  -------
  tuple
    (SIDER_tasks, dataset, transformers): the task names read from the csv
    columns after the first, the featurized dataset after all transformers
    were applied, and the list of transformers used.
  """
  print("About to featurize SIDER dataset.")
  # The csv is located relative to this module's own directory.
  module_dir = os.path.dirname(os.path.realpath(__file__))
  sider_csv = os.path.join(module_dir, "../sider/sider.csv.gz")
  featurizer = dc.feat.ConvMolFeaturizer()

  # Every column after the first is treated as a task name.
  raw_table = dc.utils.save.load_from_disk(sider_csv)
  SIDER_tasks = raw_table.columns.values[1:].tolist()
  print("SIDER tasks: %s" % str(SIDER_tasks))
  print("%d tasks in total" % len(SIDER_tasks))

  loader = dc.load.DataLoader(
      tasks=SIDER_tasks,
      smiles_field="smiles",
      featurizer=featurizer,
      verbosity="high")
  # NOTE(review): debug=True is passed here but not in load_sider_ecfp --
  # confirm whether this is intentional or left over from experimentation.
  dataset = loader.featurize(sider_csv, debug=True)
  print("%d datapoints in SIDER dataset" % len(dataset))

  # Transformers are constructed from the untransformed dataset and then
  # applied in list order.
  transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]
  print("About to transform data")
  for step in transformers:
    dataset = step.transform(dataset)

  return SIDER_tasks, dataset, transformers
+1 −1
Original line number Diff line number Diff line
@@ -17,7 +17,7 @@ K = 4
max_depth = 3
# Number of positive/negative ligands.
n_pos = 1
# NOTE(review): n_neg is assigned twice in this view; the second assignment
# (1) is the value in effect. Presumably the first line is the pre-change
# side of the diff -- confirm against the actual file.
n_neg = 5
n_neg = 1
# Set batch sizes for network.
test_batch_size = 128
# The support batch size is the sum of the positive and negative counts.
support_batch_size = n_pos + n_neg
Loading