Unverified Commit 02fce357 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #2235 from deepchem/smiles2vec

Adding in some more tests for save/reload
parents 31c9b6bf a01e688d
Loading
Loading
Loading
Loading
+48 −6
Original line number Diff line number Diff line
@@ -2686,7 +2686,14 @@ class DTNNEmbedding(tf.keras.layers.Layer):
    return config

  def build(self, input_shape):
    """Create the embedding table as a trainable layer weight.

    Parameters
    ----------
    input_shape: object
      Unused here; the weight shape is taken from
      `self.periodic_table_length` and `self.n_embedding`.
    """

    def init(shape):
      # Create the variable through `add_weight` rather than by calling the
      # initializer directly, so the layer itself owns the variable.
      return self.add_weight(
          name='kernel',
          shape=(shape[0], shape[1]),
          initializer=self.init,
          trainable=True)

    self.embedding_list = init([self.periodic_table_length, self.n_embedding])
    self.built = True

@@ -2739,7 +2746,14 @@ class DTNNStep(tf.keras.layers.Layer):
    return config

  def build(self, input_shape):
    init = initializers.get(self.init)

    def init(input_shape):
      return self.add_weight(
          name='kernel',
          shape=(input_shape[0], input_shape[1]),
          initializer=self.init,
          trainable=True)

    self.W_cf = init([self.n_embedding, self.n_hidden])
    self.W_df = init([self.n_distance, self.n_hidden])
    self.W_fc = init([self.n_hidden, self.n_embedding])
@@ -2824,7 +2838,14 @@ class DTNNGather(tf.keras.layers.Layer):
  def build(self, input_shape):
    self.W_list = []
    self.b_list = []
    init = initializers.get(self.init)

    def init(input_shape):
      return self.add_weight(
          name='kernel',
          shape=(input_shape[0], input_shape[1]),
          initializer=self.init,
          trainable=True)

    prev_layer_size = self.n_embedding
    for i, layer_size in enumerate(self.layer_sizes):
      self.W_list.append(init([prev_layer_size, layer_size]))
@@ -3230,9 +3251,16 @@ class EdgeNetwork(tf.keras.layers.Layer):
    return config

  def build(self, input_shape):
    """Create the edge-network kernel `W` and bias `b`.

    Parameters
    ----------
    input_shape: object
      Unused here; shapes are taken from `self.n_pair_features` and
      `self.n_hidden`.
    """

    def init(shape):
      # Create the variable through `add_weight` so the layer owns it. This
      # helper must not be rebound to the raw initializer afterwards,
      # otherwise `W` would be built without going through `add_weight`.
      return self.add_weight(
          name='kernel',
          shape=(shape[0], shape[1]),
          initializer=self.init,
          trainable=True)

    n_pair_features = self.n_pair_features
    n_hidden = self.n_hidden
    self.W = init([n_pair_features, n_hidden * n_hidden])
    # NOTE(review): `b` is still created with `backend.zeros` instead of
    # `add_weight` -- confirm whether it should be a layer weight as well.
    self.b = backend.zeros(shape=(n_hidden * n_hidden,))
    self.built = True
@@ -3262,7 +3290,14 @@ class GatedRecurrentUnit(tf.keras.layers.Layer):

  def build(self, input_shape):
    n_hidden = self.n_hidden
    init = initializers.get(self.init)

    def init(input_shape):
      return self.add_weight(
          name='kernel',
          shape=(input_shape[0], input_shape[1]),
          initializer=self.init,
          trainable=True)

    self.Wz = init([n_hidden, n_hidden])
    self.Wr = init([n_hidden, n_hidden])
    self.Wh = init([n_hidden, n_hidden])
@@ -3317,7 +3352,14 @@ class SetGather(tf.keras.layers.Layer):
    return config

  def build(self, input_shape):
    init = initializers.get(self.init)

    def init(input_shape):
      return self.add_weight(
          name='kernel',
          shape=(input_shape[0], input_shape[1]),
          initializer=self.init,
          trainable=True)

    self.U = init((2 * self.n_hidden, 4 * self.n_hidden))
    self.b = tf.Variable(
        np.concatenate((np.zeros(self.n_hidden), np.ones(self.n_hidden),
+243 −160
Original line number Diff line number Diff line
@@ -8,9 +8,11 @@ import tempfile
import numpy as np
import deepchem as dc
import tensorflow as tf
import scipy
from flaky import flaky
from sklearn.ensemble import RandomForestClassifier
from deepchem.molnet.load_function.chembl25_datasets import chembl25_tasks
from deepchem.feat import create_char_to_idx


def test_sklearn_classifier_reload():
@@ -527,7 +529,6 @@ def test_DAG_regression_reload():
  np.random.seed(123)
  tf.random.set_seed(123)
  n_tasks = 1
  #current_dir = os.path.dirname(os.path.abspath(__file__))

  # Load mini log-solubility dataset.
  featurizer = dc.feat.ConvMolFeaturizer()
@@ -655,158 +656,142 @@ def test_weave_classification_reload():
  assert scores[classification_metric.name] > .6


# TODO: THIS IS FAILING!
#def test_MPNN_regression_reload():
#  """Test MPNN can reload datasets."""
#  np.random.seed(123)
#  tf.random.set_seed(123)
#  n_tasks = 1
#
#  # Load mini log-solubility dataset.
#  featurizer = dc.feat.WeaveFeaturizer()
#  tasks = ["outcome"]
#  mols = ["C", "CO", "CC"]
#  n_samples = len(mols)
#  X = featurizer(mols)
#  y = np.random.rand(n_samples, n_tasks)
#  dataset = dc.data.NumpyDataset(X, y)
#
#  regression_metric = dc.metrics.Metric(
#      dc.metrics.pearson_r2_score, task_averager=np.mean)
#
#  n_atom_feat = 75
#  n_pair_feat = 14
#  batch_size = 10
#  model_dir = tempfile.mkdtemp()
#  model = dc.models.MPNNModel(
#      n_tasks,
#      n_atom_feat=n_atom_feat,
#      n_pair_feat=n_pair_feat,
#      T=2,
#      M=3,
#      batch_size=batch_size,
#      learning_rate=0.001,
#      use_queue=False,
#      mode="regression",
#      model_dir=model_dir)
#
#  # Fit trained model
#  model.fit(dataset, nb_epoch=50)
#
#  # Eval model on train
#  scores = model.evaluate(dataset, [regression_metric])
#  assert scores[regression_metric.name] > .8
#
#  # Custom save
#  save_dir = tempfile.mkdtemp()
#  model.model.save(save_dir)
#
#  from tensorflow import keras
#  reloaded = keras.models.load_model(save_dir)
#
#  # Reload trained model
#  reloaded_model = dc.models.MPNNModel(
#      n_tasks,
#      n_atom_feat=n_atom_feat,
#      n_pair_feat=n_pair_feat,
#      T=2,
#      M=3,
#      batch_size=batch_size,
#      learning_rate=0.001,
#      use_queue=False,
#      mode="regression",
#      model_dir=model_dir)
#  #reloaded_model.restore()
#  reloaded_model.model = reloaded
#
#  # Eval model on train
#  scores = reloaded_model.evaluate(dataset, [regression_metric])
#  assert scores[regression_metric.name] > .8
#
#  # Check predictions match on random sample
#  predmols = ["CCCC", "CCCCCO", "CCCCC"]
#  Xpred = featurizer(predmols)
#  predset = dc.data.NumpyDataset(Xpred)
#  origpred = model.predict(predset)
#  reloadpred = reloaded_model.predict(predset)
#  print("np.amax(origpred - reloadpred)")
#  print(np.amax(origpred - reloadpred))
#  assert np.all(origpred == reloadpred)
def test_MPNN_regression_reload():
  """Test that a trained MPNNModel can be saved and restored.

  Fits an MPNN regressor on a three-molecule toy dataset, restores a second
  model from the same `model_dir`, and checks that the restored model scores
  as well on the training set and predicts identically on new molecules.
  """
  np.random.seed(123)
  tf.random.set_seed(123)
  n_tasks = 1

  # Toy dataset: three small molecules with random regression targets.
  featurizer = dc.feat.WeaveFeaturizer()
  mols = ["C", "CO", "CC"]
  X = featurizer(mols)
  y = np.random.rand(len(mols), n_tasks)
  dataset = dc.data.NumpyDataset(X, y)

  regression_metric = dc.metrics.Metric(
      dc.metrics.pearson_r2_score, task_averager=np.mean)

  # Both models are built from one set of hyperparameters so the restored
  # model cannot silently drift from the trained one.
  model_kwargs = dict(
      n_atom_feat=75,
      n_pair_feat=14,
      T=2,
      M=3,
      batch_size=10,
      learning_rate=0.001,
      use_queue=False,
      mode="regression",
      model_dir=tempfile.mkdtemp())

  # Fit the model and check it scores well on its own training data.
  model = dc.models.MPNNModel(n_tasks, **model_kwargs)
  model.fit(dataset, nb_epoch=50)
  scores = model.evaluate(dataset, [regression_metric])
  assert scores[regression_metric.name] > .8

  # Build a second model pointing at the same model_dir and restore it.
  reloaded_model = dc.models.MPNNModel(n_tasks, **model_kwargs)
  reloaded_model.restore()

  # The restored model must score as well on the training data.
  scores = reloaded_model.evaluate(dataset, [regression_metric])
  assert scores[regression_metric.name] > .8

  # Predictions on molecules outside the training set must match exactly.
  predmols = ["CCCC", "CCCCCO", "CCCCC"]
  Xpred = featurizer(predmols)
  predset = dc.data.NumpyDataset(Xpred)
  origpred = model.predict(predset)
  reloadpred = reloaded_model.predict(predset)
  assert np.all(origpred == reloadpred)


def test_textCNN_classification_reload():
  """Test that a trained TextCNNModel can be saved and restored.

  Fits a TextCNN classifier on a three-molecule toy dataset, restores a
  second model from the same `model_dir`, and checks that scores, weights,
  predictions and layer counts all match the trained model.
  """
  np.random.seed(123)
  tf.random.set_seed(123)
  n_tasks = 1

  # Toy dataset: raw SMILES strings (also passed as ids) with random labels.
  featurizer = dc.feat.RawFeaturizer()
  mols = ["C", "CO", "CC"]
  X = featurizer(mols)
  y = np.random.randint(2, size=(len(mols), n_tasks))
  dataset = dc.data.NumpyDataset(X, y, ids=mols)

  classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score)

  char_dict, length = dc.models.TextCNNModel.build_char_dict(dataset)

  # Both models are built from one set of hyperparameters so the restored
  # model cannot silently drift from the trained one.
  model_kwargs = dict(
      seq_length=length,
      batch_size=3,
      learning_rate=0.001,
      use_queue=False,
      mode="classification",
      model_dir=tempfile.mkdtemp())

  # Fit the model and check it scores well on its own training data.
  model = dc.models.TextCNNModel(n_tasks, char_dict, **model_kwargs)
  model.fit(dataset, nb_epoch=200)
  scores = model.evaluate(dataset, [classification_metric])
  assert scores[classification_metric.name] > .8

  # Build a second model pointing at the same model_dir and restore it.
  reloaded_model = dc.models.TextCNNModel(n_tasks, char_dict, **model_kwargs)
  reloaded_model.restore()

  # The restored model must score as well on the training data.
  scores = reloaded_model.evaluate(dataset, [classification_metric])
  assert scores[classification_metric.name] > .8

  # Every weight array must be restored exactly.
  orig_weights = model.model.get_weights()
  reloaded_weights = reloaded_model.model.get_weights()
  assert len(reloaded_weights) == len(orig_weights)
  for (reloaded, orig) in zip(reloaded_weights, orig_weights):
    assert np.all(reloaded == orig)

  # Predictions on molecules outside the training set must match exactly.
  predmols = ["CCCC", "CCCCCO", "CCCCC"]
  Xpred = featurizer(predmols)
  predset = dc.data.NumpyDataset(Xpred, ids=predmols)
  origpred = model.predict(predset)
  reloadpred = reloaded_model.predict(predset)
  assert np.all(origpred == reloadpred)

  assert len(model.model.layers) == len(reloaded_model.model.layers)


def test_1d_cnn_regression_reload():
@@ -864,7 +849,7 @@ def test_1d_cnn_regression_reload():
  assert scores[regression_metric.name] < 0.1


## TODO: THIS IS FAILING!
### TODO: THIS IS FAILING!
#def test_graphconvmodel_reload():
#  featurizer = dc.feat.ConvMolFeaturizer()
#  tasks = ["outcome"]
@@ -891,12 +876,6 @@ def test_1d_cnn_regression_reload():
#  scores = model.evaluate(dataset, [classification_metric])
#  assert scores[classification_metric.name] >= 0.9
#
#  # Custom save
#  save_dir = tempfile.mkdtemp()
#  model.model.save(save_dir)
#
#  from tensorflow import keras
#  reloaded = keras.models.load_model(save_dir)
#
#  # Reload trained Model
#  reloaded_model = dc.models.GraphConvModel(
@@ -913,7 +892,7 @@ def test_1d_cnn_regression_reload():
#  predset = dc.data.NumpyDataset(Xpred)
#  origpred = model.predict(predset)
#  reloadpred = reloaded_model.predict(predset)
#  #assert np.all(origpred == reloadpred)
#  assert np.all(origpred == reloadpred)
#
#  # Try re-restore
#  reloaded_model.restore()
@@ -967,3 +946,107 @@ def test_chemception_reload():
  origpred = model.predict(predset)
  reloadpred = reloaded_model.predict(predset)
  assert np.all(origpred == reloadpred)


# TODO: This test is awkward because building the Smiles2Vec character vocabulary requires a dataset file on disk. Clean this up to match the standard model-handling API.
def test_smiles2vec_reload():
  """Test that smiles2vec models can be saved and reloaded.

  Featurizes a small ChEMBL CSV into character sequences, trains a
  Smiles2Vec classifier on random labels for a few epochs, restores a second
  model from the same `model_dir`, and checks both predict identically.
  """
  # Seed both RNGs, as the other reload tests in this file do, so the random
  # labels and the run as a whole are repeatable.
  np.random.seed(123)
  tf.random.set_seed(123)

  dataset_file = os.path.join(os.path.dirname(__file__), "chembl_25_small.csv")
  max_len = 250
  pad_len = 10
  max_seq_len = 20
  char_to_idx = create_char_to_idx(
      dataset_file, max_len=max_len, smiles_field="smiles")
  feat = dc.feat.SmilesToSeq(
      char_to_idx=char_to_idx, max_len=max_len, pad_len=pad_len)

  n_tasks = 5
  data_points = 10

  loader = dc.data.CSVLoader(
      tasks=chembl25_tasks, smiles_field='smiles', featurizer=feat)
  featurized = loader.create_dataset(
      inputs=[dataset_file], shard_size=10000, data_dir=tempfile.mkdtemp())

  # Keep only the first few rows, truncated to `max_seq_len` columns, and
  # pair them with random binary labels and unit weights.
  y = np.random.randint(0, 2, size=(data_points, n_tasks))
  w = np.ones(shape=(data_points, n_tasks))
  dataset = dc.data.NumpyDataset(featurized.X[:data_points, :max_seq_len], y,
                                 w, featurized.ids[:data_points])

  # Both models are built from one set of hyperparameters so the restored
  # model cannot silently drift from the trained one.
  model_kwargs = dict(
      char_to_idx=char_to_idx,
      max_seq_len=max_seq_len,
      use_conv=True,
      n_tasks=n_tasks,
      model_dir=tempfile.mkdtemp(),
      mode="classification")
  model = dc.models.Smiles2Vec(**model_kwargs)
  model.fit(dataset, nb_epoch=3)

  # Build a second model pointing at the same model_dir and restore it.
  reloaded_model = dc.models.Smiles2Vec(**model_kwargs)
  reloaded_model.restore()

  # Check predictions match on original dataset
  origpred = model.predict(dataset)
  reloadpred = reloaded_model.predict(dataset)
  assert np.all(origpred == reloadpred)


# TODO: We need a cleaner usage example for this
def test_DTNN_regression_reload():
  """Test that a trained DTNNModel can be saved and restored.

  Loads the example DTNN `.mat` file, fits a DTNN regressor on it, restores
  a second model from the same `model_dir`, and checks that both models
  predict identically on the training data.
  """
  # Imported explicitly: a bare `import scipy` is not guaranteed to make the
  # `scipy.io` submodule available on every SciPy version.
  from scipy.io import loadmat

  np.random.seed(123)
  tf.random.set_seed(123)

  current_dir = os.path.dirname(os.path.abspath(__file__))
  input_file = os.path.join(current_dir, "example_DTNN.mat")
  mat = loadmat(input_file)
  X = mat['X']
  y = mat['T']
  w = np.ones_like(y)
  dataset = dc.data.NumpyDataset(X, y, w, ids=None)
  n_tasks = y.shape[1]

  # Both models are built from one set of hyperparameters so the restored
  # model cannot silently drift from the trained one.
  model_kwargs = dict(
      n_embedding=20,
      n_distance=100,
      learning_rate=1.0,
      model_dir=tempfile.mkdtemp(),
      mode="regression")

  # Fit trained model
  model = dc.models.DTNNModel(n_tasks, **model_kwargs)
  model.fit(dataset, nb_epoch=250)

  # Check the fit on the training data via the mean relative error.
  pred = model.predict(dataset)
  mean_rel_error = np.mean(np.abs(1 - pred / y))
  assert mean_rel_error < 0.2

  # Build a second model pointing at the same model_dir and restore it.
  reloaded_model = dc.models.DTNNModel(n_tasks, **model_kwargs)
  reloaded_model.restore()

  # Check predictions match on the training dataset
  origpred = model.predict(dataset)
  reloadpred = reloaded_model.predict(dataset)
  assert np.all(origpred == reloadpred)
+27 −15
Original line number Diff line number Diff line
@@ -54,24 +54,36 @@ default_dict = {

class TextCNNModel(KerasModel):
  """ A Convolutional neural network on smiles strings
  Reimplementation of the discriminator module in ORGAN: https://arxiv.org/abs/1705.10843
  Originated from: http://emnlp2014.org/papers/pdf/EMNLP2014181.pdf

  This model applies multiple 1D convolutional filters to the padded strings,
  then max-over-time pooling is applied on all filters, extracting one feature per filter.
  All features are concatenated and transformed through several hidden layers to form predictions.
  Reimplementation of the discriminator module in ORGAN [1]_ .
  Originated from [2]_. 

  This model is initially developed for sentence-level classification tasks, with
  words represented as vectors. In this implementation, SMILES strings are dissected
  into characters and transformed to one-hot vectors in a similar way. The model can
  be used for general molecular-level classification or regression tasks. It is also
  used in the ORGAN model as discriminator.
  This model applies multiple 1D convolutional filters to
  the padded strings, then max-over-time pooling is applied on
  all filters, extracting one feature per filter.  All
  features are concatenated and transformed through several
  hidden layers to form predictions.

  Training of the model only requires SMILES strings input, all featurized datasets
  that include SMILES in the `ids` attribute are accepted. PDBbind, QM7 and QM7b
  are not supported. To use the model, `build_char_dict` should be called first
  before defining the model to build character dict of input dataset, example can
  be found in examples/delaney/delaney_textcnn.py
  This model is initially developed for sentence-level
  classification tasks, with words represented as vectors. In
  this implementation, SMILES strings are dissected into
  characters and transformed to one-hot vectors in a similar
  way. The model can be used for general molecular-level
  classification or regression tasks. It is also used in the
  ORGAN model as discriminator.

  Training of the model only requires SMILES strings input,
  all featurized datasets that include SMILES in the `ids`
  attribute are accepted. PDBbind, QM7 and QM7b are not
  supported. To use the model, `build_char_dict` should be
  called first before defining the model to build character
  dict of input dataset, example can be found in
  examples/delaney/delaney_textcnn.py

  References
  ----------
  .. [1]  Guimaraes, Gabriel Lima, et al. "Objective-reinforced generative adversarial networks (ORGAN) for sequence generation models." arXiv preprint arXiv:1705.10843 (2017).
  .. [2] Kim, Yoon. "Convolutional neural networks for sentence classification." arXiv preprint arXiv:1408.5882 (2014).

  """