Unverified commit cc7e2ecd, authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #2224 from deepchem/chemception

Adding chemception save/reload tests
parents cd7d2c1b 55bdc191
Loading
Loading
Loading
Loading
+128 −122
Original line number Diff line number Diff line
@@ -4,129 +4,135 @@ import numpy as np
import tempfile

import pytest

import deepchem as dc
from deepchem.models import Smiles2Vec, ChemCeption
from deepchem.feat import create_char_to_idx, SmilesToSeq, SmilesToImage
from deepchem.molnet.load_function.chembl25_datasets import chembl25_tasks


@pytest.mark.skip(reason="Unknown")
class TestChemnetModel(unittest.TestCase):
  """unittest-style test case for the ChemNet models.

  The whole class is skipped through ``pytest.mark.skip``.
  """

  def setUp(self):
    """Store the sequence length, sample count and task count on the instance."""
    self.max_seq_len = 20
    self.data_points = 10
    self.n_tasks = 5

  def get_dataset(self, mode="classification", featurizer="smiles2seq"):
    """Resolve the path of ``chembl_25_small.csv`` next to this test file.

    NOTE(review): only the path lookup is visible in this span; how ``mode``
    and ``featurizer`` are consumed is not shown here -- confirm against the
    full method before relying on this description.
    """
    # The CSV fixture is expected to live in the same directory as this module.
    dataset_file = os.path.join(
        os.path.dirname(__file__), "chembl_25_small.csv")
def get_dataset(mode="classification",
                featurizer="smiles2seq",
                max_seq_len=20,
                data_points=10,
                n_tasks=5):
  """Build a small dataset with random labels from ``chembl_25_small.csv``.

  The SMILES strings in the CSV file that sits next to this module are
  featurized, the first ``data_points`` rows are kept, and labels are drawn
  at random (they are NOT the labels stored in the CSV). All weights are 1.

  Parameters
  ----------
  mode: str
    ``"classification"`` draws 0/1 integer labels and pairs them with a
    mean ROC-AUC metric. Any other value draws normally distributed labels
    and pairs them with a mean-absolute-error metric.
  featurizer: str
    ``"smiles2seq"`` (``SmilesToSeq``) or ``"smiles2img"``
    (``SmilesToImage``).
  max_seq_len: int
    Number of leading feature columns kept per sample; only used for
    ``"smiles2seq"``.
  data_points: int
    Number of samples kept from the featurized file.
  n_tasks: int
    Number of label/weight columns to generate.

  Returns
  -------
  tuple
    ``(dataset, metric, char_to_idx)`` when ``featurizer == "smiles2seq"``,
    otherwise ``(dataset, metric)``.

  Raises
  ------
  ValueError
    If ``featurizer`` is not one of the two supported names.
  """
  # Fail early with a clear message; otherwise `feat` would stay unbound and
  # the loader construction below would die with a confusing NameError.
  if featurizer not in ("smiles2seq", "smiles2img"):
    raise ValueError(
        "featurizer must be 'smiles2seq' or 'smiles2img', got %r" %
        (featurizer,))

  dataset_file = os.path.join(os.path.dirname(__file__), "chembl_25_small.csv")

  # Keep the featurizer object in `feat` so that the string in `featurizer`
  # is still available for the branch at the end of the function.
  if featurizer == "smiles2seq":
    max_len = 250
    pad_len = 10
    char_to_idx = create_char_to_idx(
        dataset_file, max_len=max_len, smiles_field="smiles")
    feat = SmilesToSeq(
        char_to_idx=char_to_idx, max_len=max_len, pad_len=pad_len)
  else:
    img_size = 80
    img_spec = "engd"
    res = 0.5
    feat = SmilesToImage(img_size=img_size, img_spec=img_spec, res=res)

  loader = dc.data.CSVLoader(
      tasks=chembl25_tasks, smiles_field='smiles', featurizer=feat)
  dataset = loader.create_dataset(
      inputs=[dataset_file], shard_size=10000, data_dir=tempfile.mkdtemp())

  w = np.ones(shape=(data_points, n_tasks))

  if mode == 'classification':
    y = np.random.randint(0, 2, size=(data_points, n_tasks))
    metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score, np.mean, mode="classification")
  else:
    y = np.random.normal(size=(data_points, n_tasks))
    metric = dc.metrics.Metric(
        dc.metrics.mean_absolute_error, mode="regression")

  ids = dataset.ids[:data_points]
  if featurizer == "smiles2seq":
    # Sequence features are additionally cut to `max_seq_len` columns.
    dataset = dc.data.NumpyDataset(dataset.X[:data_points, :max_seq_len], y, w,
                                   ids)
    return dataset, metric, char_to_idx

  dataset = dc.data.NumpyDataset(dataset.X[:data_points], y, w, ids)
  return dataset, metric


@pytest.mark.slow
def test_chemception_regression():
  """Fit ChemCeption in regression mode and check the training-set error.

  The model is trained for 300 epochs on the image-featurized dataset from
  ``get_dataset`` and then evaluated on that same dataset; the mean absolute
  error must be below 0.1.
  """
  n_tasks = 5
  dataset, metric = get_dataset(
      mode="regression", featurizer="smiles2img", n_tasks=n_tasks)
  model = ChemCeption(
      n_tasks=n_tasks, img_spec="engd", model_dir=None, mode="regression")
  model.fit(dataset, nb_epoch=300)
  scores = model.evaluate(dataset, [metric], [])
  assert scores['mean_absolute_error'] < 0.1


@pytest.mark.slow
def test_chemception_classification():
  """Fit ChemCeption in classification mode and check the training-set AUC.

  The model is trained for 300 epochs on the image-featurized dataset from
  ``get_dataset`` and then evaluated on that same dataset; the mean ROC-AUC
  must be at least 0.9.
  """
  n_tasks = 5
  dataset, metric = get_dataset(
      mode="classification", featurizer="smiles2img", n_tasks=n_tasks)
  model = ChemCeption(
      n_tasks=n_tasks, img_spec="engd", model_dir=None, mode="classification")
  model.fit(dataset, nb_epoch=300)
  scores = model.evaluate(dataset, [metric], [])
  assert scores['mean-roc_auc_score'] >= 0.9


@pytest.mark.slow
def test_smiles_to_vec_regression():
  """Fit Smiles2Vec in regression mode and check the training-set error.

  The model is trained for 500 epochs on the sequence-featurized dataset
  from ``get_dataset`` and then evaluated on that same dataset; the mean
  absolute error must be below 0.1.
  """
  n_tasks = 5
  max_seq_len = 20
  dataset, metric, char_to_idx = get_dataset(
      mode="regression",
      featurizer="smiles2seq",
      n_tasks=n_tasks,
      max_seq_len=max_seq_len)
  model = Smiles2Vec(
      char_to_idx=char_to_idx,
      max_seq_len=max_seq_len,
      use_conv=True,
      n_tasks=n_tasks,
      model_dir=None,
      mode="regression")
  model.fit(dataset, nb_epoch=500)
  scores = model.evaluate(dataset, [metric], [])
  assert scores['mean_absolute_error'] < 0.1


@pytest.mark.slow
def test_smiles_to_vec_classification():
  """Fit Smiles2Vec in classification mode and check the training-set AUC.

  The model is trained for 500 epochs on the sequence-featurized dataset
  from ``get_dataset`` and then evaluated on that same dataset; the mean
  ROC-AUC must be at least 0.9.
  """
  n_tasks = 5
  max_seq_len = 20
  dataset, metric, char_to_idx = get_dataset(
      mode="classification",
      featurizer="smiles2seq",
      n_tasks=n_tasks,
      max_seq_len=max_seq_len)
  model = Smiles2Vec(
      char_to_idx=char_to_idx,
      max_seq_len=max_seq_len,
      use_conv=True,
      n_tasks=n_tasks,
      model_dir=None,
      mode="classification")
  model.fit(dataset, nb_epoch=500)
  scores = model.evaluate(dataset, [metric], [])
  assert scores['mean-roc_auc_score'] >= 0.9


@pytest.mark.slow
  def test_chemception_fit_with_augmentation(self):
    dataset, metric = self.get_dataset(
        mode="classification", featurizer="smiles2img")
def test_chemception_fit_with_augmentation():
  n_tasks = 5
  dataset, metric = get_dataset(
      mode="classification", featurizer="smiles2img", n_tasks=n_tasks)
  model = ChemCeption(
        n_tasks=self.n_tasks,
      n_tasks=n_tasks,
      img_spec="engd",
      model_dir=None,
      augment=True,
+45 −44
Original line number Diff line number Diff line
"""
Test reload for trained models.
"""
import os
import pytest
import unittest
import tempfile
@@ -9,6 +10,7 @@ import deepchem as dc
import tensorflow as tf
from flaky import flaky
from sklearn.ensemble import RandomForestClassifier
from deepchem.molnet.load_function.chembl25_datasets import chembl25_tasks


def test_sklearn_classifier_reload():
@@ -1007,47 +1009,46 @@ def test_1d_cnn_regression_reload():
#  scores = reloaded_model.evaluate(dataset, [classification_metric])
#  assert scores[classification_metric.name] > .9

#def test_chemception_reload():
#  """Test that chemception models can be saved and reloaded."""
#  img_size = 80
#  img_spec = "engd"
#  res = 0.5
#  n_tasks = 1
#  featurizer = dc.feat.SmilesToImage(
#      img_size=img_size, img_spec=img_spec, res=res)
#  mols = ["C", "CC", "CCC"]
#  X = featurizer(mols)
#  y = np.array([0, 1, 0])
#  dataset = dc.data.NumpyDataset(X, y, ids=mols)
#  classsification_metric = dc.metrics.Metric(
#      dc.metrics.roc_auc_score, np.mean, mode="classification")
#
#  model_dir = tempfile.mkdtemp()
#  model = dc.models.ChemCeption(
#      n_tasks=n_tasks,
#      img_spec="engd",
#      model_dir=model_dir,
#      mode="classification")
#  model.fit(dataset, nb_epoch=300)
#  scores = model.evaluate(dataset, [metric], [])
#  assert scores[classification_metric.name] >= 0.9
#
#  # Reload Trained Model
#  reloaded_model = dc.models.ChemCeption(
#      n_tasks=n_tasks,
#      img_spec="engd",
#      model_dir=model_dir,
#      mode="classification")
#  reloaded_model.restore()
#
#  # Check predictions match on random sample
#  predmols = ["CCCC", "CCCCCO", "CCCCC"]
#  Xpred = featurizer(predmols)
#  predset = dc.data.NumpyDataset(Xpred)
#  origpred = model.predict(predset)
#  reloadpred = reloaded_model.predict(predset)
#  assert np.all(origpred == reloadpred)
#
#  # Eval model on train
#  scores = reloaded_model.evaluate(dataset, [classification_metric])
#  assert scores[classification_metric.name] > .9

def test_chemception_reload():
  """Test that chemception models can be saved and reloaded.

  A ChemCeption classifier is fit briefly on ten copies of one molecule with
  random 0/1 labels, a second model is constructed on the same ``model_dir``
  and restored from it, and both models must produce identical predictions
  on a few unseen molecules. Model quality is not checked here.
  """
  img_size = 80
  img_spec = "engd"
  res = 0.5
  n_tasks = 1
  data_points = 10
  featurizer = dc.feat.SmilesToImage(
      img_size=img_size, img_spec=img_spec, res=res)

  mols = ["CCCCCCCC"] * data_points
  X = featurizer(mols)
  y = np.random.randint(0, 2, size=(data_points, n_tasks))
  w = np.ones(shape=(data_points, n_tasks))
  dataset = dc.data.NumpyDataset(X, y, w, mols)

  # Both models share this directory so the second can pick up what the
  # first one wrote during fit().
  model_dir = tempfile.mkdtemp()
  model = dc.models.ChemCeption(
      n_tasks=n_tasks,
      img_spec=img_spec,
      model_dir=model_dir,
      mode="classification")
  model.fit(dataset, nb_epoch=3)

  # Reload Trained Model
  reloaded_model = dc.models.ChemCeption(
      n_tasks=n_tasks,
      img_spec=img_spec,
      model_dir=model_dir,
      mode="classification")
  reloaded_model.restore()

  # Check predictions match on random sample
  predmols = ["CCCC", "CCCCCO", "CCCCC"]
  Xpred = featurizer(predmols)
  predset = dc.data.NumpyDataset(Xpred)
  origpred = model.predict(predset)
  reloadpred = reloaded_model.predict(predset)
  # array_equal also compares shapes, so a mismatch cannot pass by broadcasting.
  assert np.array_equal(origpred, reloadpred)