Unverified commit cd7d2c1b, authored by Bharath Ramsundar and committed by GitHub
Browse files

Merge pull request #2219 from deepchem/robust

Fixing singletask robust multitask regression
parents 70ebcbc2 4ac406ef
Loading
Loading
Loading
Loading
+19 −2
Original line number Diff line number Diff line
@@ -3,10 +3,12 @@ import tensorflow as tf
import collections

import logging
import deepchem as dc
from deepchem.metrics import to_one_hot
from deepchem.models import KerasModel
from deepchem.models.layers import Stack
from deepchem.models.losses import SoftmaxCrossEntropy, L2Loss
from typing import Tuple, Iterable, List

logger = logging.getLogger(__name__)

@@ -348,6 +350,21 @@ class RobustMultitaskRegressor(KerasModel):
      task_out = tf.keras.layers.Dense(1)(task_layer)
      task_outputs.append(task_out)

    outputs = tf.keras.layers.Concatenate(axis=1)(task_outputs)
    outputs = Stack(axis=1)(task_outputs)
    model = tf.keras.Model(inputs=mol_features, outputs=outputs)
    super(RobustMultitaskRegressor, self).__init__(model, L2Loss(), **kwargs)
    super(RobustMultitaskRegressor, self).__init__(
        model, L2Loss(), output_types=['prediction'], **kwargs)

  def default_generator(
      self,
      dataset: dc.data.Dataset,
      epochs: int = 1,
      mode: str = 'fit',
      deterministic: bool = True,
      pad_batches: bool = True) -> Iterable[Tuple[List, List, List]]:
    """Yield the batches of `dataset` as `(inputs, labels, weights)` triples.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Source of the batches; `iterbatches` is called on it once per epoch.
    epochs: int
      Number of passes to make over `dataset`.
    mode: str
      Not consulted by this implementation.
    deterministic: bool
      Forwarded unchanged to `dataset.iterbatches`.
    pad_batches: bool
      Forwarded unchanged to `dataset.iterbatches`.

    Yields
    ------
    Tuple[List, List, List]
      `([X_b], [y_b], [w_b])`, i.e. each array of the batch wrapped in a
      single-element list. The batch ids are dropped.
    """
    for _ in range(epochs):
      batches = dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches)
      for features, labels, weights, _ids in batches:
        yield ([features], [labels], [weights])
+2 −2
Original line number Diff line number Diff line
@@ -538,8 +538,8 @@ def test_residual_regression_overfit():
  assert scores[regression_metric.name] < .02


def test_tf_robust_multitask_regression_overfit():
  """Test tf robust multitask overfits tiny data."""
def test_robust_multitask_regression_overfit():
  """Test robust multitask overfits tiny data."""
  np.random.seed(123)
  tf.random.set_seed(123)
  n_tasks = 10
+58 −57
Original line number Diff line number Diff line
@@ -278,63 +278,64 @@ def test_robust_multitask_classification_reload():
  assert scores[classification_metric.name] > .9


# TODO: THIS DOESN'T WORK!!
#def test_robust_multitask_regressor_reload():
#  """Test that RobustMultitaskRegressor can be reloaded correctly."""
#  n_tasks = 10
#  n_samples = 10
#  n_features = 3
#
#  # Generate dummy dataset
#  np.random.seed(123)
#  ids = np.arange(n_samples)
#  X = np.random.rand(n_samples, n_features)
#  y = np.random.rand(n_samples, n_tasks)
#  w = np.ones((n_samples, n_tasks))
#
#  dataset = dc.data.NumpyDataset(X, y, w, ids)
#  regression_metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
#
#  model_dir = tempfile.mkdtemp()
#  model = dc.models.RobustMultitaskRegressor(
#      n_tasks,
#      n_features,
#      layer_sizes=[50],
#      bypass_layer_sizes=[10],
#      dropouts=[0.],
#      learning_rate=0.003,
#      weight_init_stddevs=[.1],
#      batch_size=n_samples)
#
#  # Fit trained model
#  model.fit(dataset, nb_epoch=100)
#
#  # Eval model on train
#  scores = model.evaluate(dataset, [regression_metric])
#  assert scores[regression_metric.name] < .1
#
#  # Reload trained model
#  reloaded_model = dc.models.RobustMultitaskRegressor(
#      n_tasks,
#      n_features,
#      layer_sizes=[50],
#      bypass_layer_sizes=[10],
#      dropouts=[0.],
#      learning_rate=0.003,
#      weight_init_stddevs=[.1],
#      batch_size=n_samples)
#  reloaded_model.restore()
#
#  # Check predictions match on random sample
#  Xpred = np.random.rand(n_samples, n_features)
#  predset = dc.data.NumpyDataset(Xpred)
#  origpred = model.predict(predset)
#  reloadpred = reloaded_model.predict(predset)
#  assert np.all(origpred == reloadpred)
#
#  # Eval model on train
#  scores = reloaded_model.evaluate(dataset, [regression_metric])
#  assert scores[regression_metric.name] < 0.1
def test_robust_multitask_regressor_reload():
  """Test that RobustMultitaskRegressor can be reloaded correctly.

  A model is fit on a tiny random dataset, then a second, identically
  configured model sharing the same `model_dir` calls `restore()`. The two
  models must give identical predictions on fresh random inputs, and the
  restored model must still score well on the training data.
  """
  n_tasks = 10
  n_samples = 10
  n_features = 3

  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.rand(n_samples, n_tasks)
  w = np.ones((n_samples, n_tasks))

  dataset = dc.data.NumpyDataset(X, y, w, ids)
  regression_metric = dc.metrics.Metric(dc.metrics.mean_squared_error)

  # Shared by both instances so the original and the reloaded model cannot
  # silently drift apart in architecture or hyperparameters.
  model_kwargs = dict(
      layer_sizes=[50],
      bypass_layer_sizes=[10],
      dropouts=[0.],
      learning_rate=0.003,
      weight_init_stddevs=[.1],
      batch_size=n_samples)

  # The context manager removes the directory when the test finishes, even
  # if an assertion fails; a bare mkdtemp() would leave it behind on disk.
  with tempfile.TemporaryDirectory() as model_dir:
    model = dc.models.RobustMultitaskRegressor(
        n_tasks, n_features, model_dir=model_dir, **model_kwargs)

    # Fit trained model
    model.fit(dataset, nb_epoch=100)

    # Eval model on train
    scores = model.evaluate(dataset, [regression_metric])
    assert scores[regression_metric.name] < .1

    # Reload trained model. It is pointed at the same model_dir as the model
    # that was just fit, then asked to restore from it.
    reloaded_model = dc.models.RobustMultitaskRegressor(
        n_tasks, n_features, model_dir=model_dir, **model_kwargs)
    reloaded_model.restore()

    # Check predictions match on random sample
    Xpred = np.random.rand(n_samples, n_features)
    predset = dc.data.NumpyDataset(Xpred)
    origpred = model.predict(predset)
    reloadpred = reloaded_model.predict(predset)
    assert np.all(origpred == reloadpred)

    # Eval model on train
    scores = reloaded_model.evaluate(dataset, [regression_metric])
    assert scores[regression_metric.name] < 0.1


def test_IRV_multitask_classification_reload():
+68 −0
Original line number Diff line number Diff line
import numpy as np
import tensorflow as tf
import deepchem as dc


def test_singletask_robust_multitask_classification():
  """Test robust multitask singletask classification.

  Smoke test: builds a RobustMultitaskClassifier with `n_tasks = 1` and
  checks only that one epoch of fitting completes without raising. No
  metric is evaluated.
  """
  n_tasks = 1
  n_samples = 10
  n_features = 3

  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.zeros((n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = dc.data.NumpyDataset(X, y, w, ids)

  model = dc.models.RobustMultitaskClassifier(
      n_tasks,
      n_features,
      layer_sizes=[50],
      bypass_layer_sizes=[10],
      dropouts=[0.],
      learning_rate=0.003,
      weight_init_stddevs=[.1],
      batch_size=n_samples)

  # Fit trained model
  model.fit(dataset, nb_epoch=1)


def test_singletask_robust_multitask_regression():
  """Test singletask robust multitask regression.

  Smoke test: builds a RobustMultitaskRegressor with `n_tasks = 1` and
  checks only that one epoch of fitting completes without raising. No
  metric is evaluated.
  """
  # Seed both libraries once, before any random numbers are drawn.
  np.random.seed(123)
  tf.random.set_seed(123)
  n_tasks = 1
  n_samples = 10
  n_features = 3

  # Generate dummy dataset
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.zeros((n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))

  dataset = dc.data.NumpyDataset(X, y, w, ids)

  model = dc.models.RobustMultitaskRegressor(
      n_tasks,
      n_features,
      layer_sizes=[50],
      bypass_layer_sizes=[10],
      dropouts=[0.],
      learning_rate=0.003,
      weight_init_stddevs=[.1],
      batch_size=n_samples)

  # Fit trained model
  model.fit(dataset, nb_epoch=1)