Unverified Commit 0fcc6b40 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #2239 from deepchem/seq2seq

Adding in a batch of new save/reload tests
parents e06055e7 4019d552
Loading
Loading
Loading
Loading
+199 −85
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@ import deepchem as dc
import numpy as np
import tensorflow as tf
import unittest
import tempfile
from tensorflow.keras.layers import Input, Concatenate, Dense
from flaky import flaky

@@ -49,17 +50,13 @@ class ExampleGAN(dc.models.GAN):
    return tf.keras.Model(inputs=inputs, outputs=output)


class TestGAN(unittest.TestCase):

@flaky
  def test_cgan(self):
def test_cgan():
  """Test fitting a conditional GAN."""

  gan = ExampleGAN(learning_rate=0.01)
  gan.fit_gan(
        generate_data(gan, 500, 100),
        generator_steps=0.5,
        checkpoint_interval=0)
      generate_data(gan, 500, 100), generator_steps=0.5, checkpoint_interval=0)

  # See if it has done a plausible job of learning the distribution.

@@ -70,15 +67,75 @@ class TestGAN(unittest.TestCase):
  assert np.std(deltas) > 1.0
  assert gan.get_global_step() == 500


@flaky
def test_cgan_reload():
  """Test that a conditional GAN can be saved and restored from disk."""

  save_dir = tempfile.mkdtemp()
  gan = ExampleGAN(learning_rate=0.01, model_dir=save_dir)
  gan.fit_gan(generate_data(gan, 500, 100), generator_steps=0.5)

  # Sanity-check that training produced a plausible distribution before
  # comparing against the reloaded model.
  means = 10 * np.random.random([1000, 1])
  noise_input = gan.get_noise_batch(batch_size=len(means))
  values = gan.predict_gan_generator(
      noise_input=noise_input, conditional_inputs=[means])
  deltas = values - means
  assert abs(np.mean(deltas)) < 1.0
  assert np.std(deltas) > 1.0
  assert gan.get_global_step() == 500

  # A fresh model restored from the same directory must reproduce the
  # trained generator's outputs exactly for identical inputs.
  restored = ExampleGAN(learning_rate=0.01, model_dir=save_dir)
  restored.restore()
  restored_values = restored.predict_gan_generator(
      noise_input=noise_input, conditional_inputs=[means])
  assert np.all(values == restored_values)


@flaky
def test_mix_gan_reload():
  """Test reloading a GAN with multiple generators and discriminators.

  Trains a 2-generator/2-discriminator GAN, restores a second copy from the
  same model_dir, and checks each reloaded generator reproduces the original
  one exactly.
  """
  # NOTE(review): a leftover removed-diff line (`def test_mix_gan(self):`)
  # preceded this function in the merged text and would have broken the
  # module; it is dropped here.

  model_dir = tempfile.mkdtemp()
  gan = ExampleGAN(
      n_generators=2,
      n_discriminators=2,
      learning_rate=0.01,
      model_dir=model_dir)
  gan.fit_gan(generate_data(gan, 1000, 100), generator_steps=0.5)

  # The reloaded model must be built with identical hyperparameters so
  # restore() can load the saved weights.
  reloaded_gan = ExampleGAN(
      n_generators=2,
      n_discriminators=2,
      learning_rate=0.01,
      model_dir=model_dir)
  reloaded_gan.restore()

  # Each generator of the reloaded model must reproduce the corresponding
  # generator of the original model exactly.
  means = 10 * np.random.random([1000, 1])
  batch_size = len(means)
  noise_input = gan.get_noise_batch(batch_size=batch_size)
  for i in range(2):
    values = gan.predict_gan_generator(
        noise_input=noise_input, conditional_inputs=[means], generator_index=i)
    reloaded_values = reloaded_gan.predict_gan_generator(
        noise_input=noise_input, conditional_inputs=[means], generator_index=i)
    assert np.all(values == reloaded_values)
  assert gan.get_global_step() == 1000
  # No training has been done after reload
  assert reloaded_gan.get_global_step() == 0


@flaky
def test_mix_gan():
  """Test a GAN with multiple generators and discriminators."""

  gan = ExampleGAN(n_generators=2, n_discriminators=2, learning_rate=0.01)
  gan.fit_gan(
        generate_data(gan, 1000, 100),
        generator_steps=0.5,
        checkpoint_interval=0)
      generate_data(gan, 1000, 100), generator_steps=0.5, checkpoint_interval=0)

  # See if it has done a plausible job of learning the distribution.

@@ -91,8 +148,9 @@ class TestGAN(unittest.TestCase):
    assert np.std(deltas) > 1.0
  assert gan.get_global_step() == 1000


@flaky
  def test_wgan(self):
def test_wgan():
  """Test fitting a conditional WGAN."""

  class ExampleWGAN(dc.models.WGAN):
@@ -137,3 +195,59 @@ class TestGAN(unittest.TestCase):
  deltas = values - means
  assert abs(np.mean(deltas)) < 1.0
  assert np.std(deltas) > 1.0


@flaky
def test_wgan_reload():
  """Test reloading a conditional WGAN.

  Trains a WGAN, restores a second copy from the same model_dir, and checks
  that both generators produce identical outputs for identical inputs.
  """

  class ExampleWGAN(dc.models.WGAN):
    # Minimal conditional WGAN: the generator maps (2-dim noise, 1-dim
    # conditional input) to a single scalar sample.

    def get_noise_input_shape(self):
      return (2,)

    def get_data_input_shapes(self):
      return [(1,)]

    def get_conditional_input_shapes(self):
      return [(1,)]

    def create_generator(self):
      # Concatenate noise with the conditional input, then one linear layer.
      noise_input = Input(self.get_noise_input_shape())
      conditional_input = Input(self.get_conditional_input_shapes()[0])
      inputs = [noise_input, conditional_input]
      gen_in = Concatenate(axis=1)(inputs)
      output = Dense(1)(gen_in)
      return tf.keras.Model(inputs=inputs, outputs=output)

    def create_discriminator(self):
      # Score a (sample, conditional) pair with a small dense network.
      data_input = Input(self.get_data_input_shapes()[0])
      conditional_input = Input(self.get_conditional_input_shapes()[0])
      inputs = [data_input, conditional_input]
      discrim_in = Concatenate(axis=1)(inputs)
      dense = Dense(10, activation=tf.nn.relu)(discrim_in)
      output = Dense(1)(dense)
      return tf.keras.Model(inputs=inputs, outputs=output)

  # We have to set the gradient penalty very small because the generator's
  # output is only a single number, so the default penalty would constrain
  # it far too much.

  model_dir = tempfile.mkdtemp()
  gan = ExampleWGAN(
      learning_rate=0.01, gradient_penalty=0.1, model_dir=model_dir)
  gan.fit_gan(generate_data(gan, 1000, 100), generator_steps=0.1)

  # Build an identically-configured model and restore the saved weights.
  reloaded_gan = ExampleWGAN(
      learning_rate=0.01, gradient_penalty=0.1, model_dir=model_dir)
  reloaded_gan.restore()

  # See if it has done a plausible job of learning the distribution.
  means = 10 * np.random.random([1000, 1])
  batch_size = len(means)
  noise_input = gan.get_noise_batch(batch_size=batch_size)
  values = gan.predict_gan_generator(
      noise_input=noise_input, conditional_inputs=[means])
  reloaded_values = reloaded_gan.predict_gan_generator(
      noise_input=noise_input, conditional_inputs=[means])
  # The restored generator must reproduce the original's outputs exactly.
  assert np.all(values == reloaded_values)
+2 −2
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ from sklearn.model_selection import train_test_split
import deepchem as dc


def test_signletask_regression_with_xgboost():
def test_singletask_regression_with_xgboost():
  np.random.seed(123)

  # prepare dataset
@@ -41,7 +41,7 @@ def test_signletask_regression_with_xgboost():
  assert scores[regression_metric.name] < 55


def test_signletask_regression_with_lightgbm():
def test_singletask_regression_with_lightgbm():
  np.random.seed(123)

  # prepare dataset
+79 −0
Original line number Diff line number Diff line
@@ -1043,3 +1043,82 @@ def test_DTNN_regression_reload():
  origpred = model.predict(dataset)
  reloadpred = reloaded_model.predict(dataset)
  assert np.all(origpred == reloadpred)


def generate_sequences(sequence_length, num_sequences):
  """Yield (input, target) pairs of random digit sequences.

  Each pair is (seq, seq) — the target equals the input — with a random
  length in [1, sequence_length] and tokens drawn from 0-9.
  """
  for _ in range(num_sequences):
    # Draw the length first, then each token, preserving the RNG call order.
    length = np.random.randint(1, sequence_length + 1)
    seq = [np.random.randint(10) for _ in range(length)]
    yield (seq, seq)


def test_seq2seq_reload():
  """Test that a SeqToSeq model can be saved and restored from disk."""

  sequence_length = 8
  tokens = list(range(10))
  model_dir = tempfile.mkdtemp()

  def build_model():
    # Original and reloaded models must share identical hyperparameters so
    # that restore() can load the saved weights.
    return dc.models.SeqToSeq(
        tokens,
        tokens,
        sequence_length,
        encoder_layers=2,
        decoder_layers=2,
        embedding_dimension=150,
        learning_rate=0.01,
        dropout=0.1,
        model_dir=model_dir)

  model = build_model()

  # Train the model on random sequences.  We aren't training long enough to
  # really make it reliable, but I want to keep this test fast, and it should
  # still be able to reproduce a reasonable fraction of input sequences.
  model.fit_sequences(generate_sequences(sequence_length, 25000))

  # Predictions from the trained model on a fresh batch of test sequences.
  tests = [seq for seq, target in generate_sequences(sequence_length, 50)]
  pred1 = model.predict_from_sequences(tests, beam_width=1)
  pred4 = model.predict_from_sequences(tests, beam_width=4)

  restored = build_model()
  restored.restore()

  # Sequence predictions must match exactly after reloading, for both
  # greedy decoding (beam_width=1) and beam search (beam_width=4).
  for width, original in [(1, pred1), (4, pred4)]:
    reloaded = restored.predict_from_sequences(tests, beam_width=width)
    assert len(original) == len(reloaded)
    for orig_seq, reload_seq in zip(original, reloaded):
      assert orig_seq == reload_seq

  # The embedding path must also survive the round trip.
  embeddings = model.predict_embeddings(tests)
  pred1e = model.predict_from_embeddings(embeddings, beam_width=1)
  pred4e = model.predict_from_embeddings(embeddings, beam_width=4)

  restored_embeddings = restored.predict_embeddings(tests)
  assert np.all(embeddings == restored_embeddings)

  for width, original in [(1, pred1e), (4, pred4e)]:
    reloaded = restored.predict_from_embeddings(
        restored_embeddings, beam_width=width)
    assert len(original) == len(reloaded)
    for orig_seq, reload_seq in zip(original, reloaded):
      assert orig_seq == reload_seq
+31 −4
Original line number Diff line number Diff line
import unittest

import deepchem
import tempfile
import deepchem as dc
import numpy as np


@@ -16,10 +16,37 @@ class TestScScoreModel(unittest.TestCase):

    X = np.random.rand(n_samples, 2, n_features)
    y = np.random.randint(2, size=(n_samples, n_tasks))
    dataset = deepchem.data.NumpyDataset(X, y)
    dataset = dc.data.NumpyDataset(X, y)

    model = dc.models.ScScoreModel(n_features, dropouts=0)

    model.fit(dataset, nb_epoch=100)
    pred = model.predict(dataset)
    assert np.array_equal(y, pred[0] > pred[1])


def test_scscore_reload():
  """Test reloading of ScScoreModel.

  Fits a model on random data, restores a second copy from the same
  model_dir, and checks both models predict identically.
  """
  # NOTE(review): a leftover removed-diff line
  # (`model = deepchem.models.ScScoreModel(...)` at class-method indent)
  # sat inside this function in the merged text and would have broken the
  # module; it is dropped here.
  n_samples = 10
  n_features = 3
  n_tasks = 1

  # Create a random dataset for fitting.
  X = np.random.rand(n_samples, 2, n_features)
  y = np.random.randint(2, size=(n_samples, n_tasks))
  dataset = dc.data.NumpyDataset(X, y)

  model_dir = tempfile.mkdtemp()
  model = dc.models.ScScoreModel(n_features, dropouts=0, model_dir=model_dir)
  model.fit(dataset, nb_epoch=100)
  pred = model.predict(dataset)
  assert np.array_equal(y, pred[0] > pred[1])

  # The restored model must reproduce the original's predictions exactly.
  reloaded_model = dc.models.ScScoreModel(
      n_features, dropouts=0, model_dir=model_dir)
  reloaded_model.restore()
  reloaded_pred = reloaded_model.predict(dataset)
  assert len(pred) == len(reloaded_pred)
  for p, r in zip(pred, reloaded_pred):
    assert np.all(p == r)