Commit ee2bc2a8 authored by Bharath Ramsundar
Browse files

More test refactorings

parent 4a1b7527
Loading
Loading
Loading
Loading
+23 −53
Original line number Diff line number Diff line
@@ -16,71 +16,41 @@ import shutil
from deepchem.splits import RandomSplitter
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.coulomb_matrices import CoulombMatrixEig
from deepchem.models.tests import TestAPI

class TestFeaturizedSamples(unittest.TestCase):
class TestFeaturizedSamples(TestAPI):
  """
  Test Featurized Samples class.
  """
  def setUp(self):
    self.current_dir = os.path.dirname(os.path.abspath(__file__))
    self.smiles_field = "smiles"
    self.mol_field = "mol"
    self.feature_dir = tempfile.mkdtemp()
    self.samples_dir = tempfile.mkdtemp()
    self.train_dir = tempfile.mkdtemp()
    self.valid_dir = tempfile.mkdtemp()
    self.test_dir = tempfile.mkdtemp()

  def _featurize_train_valid_test_split(self, splittype, input_file, tasks,
                                        frac_train, frac_valid, frac_test):
    # Featurize input
    compound_featurizers = [CoulombMatrixEig(6, remove_hydrogens=False)]
    complex_featurizers = []
    featurizers = compound_featurizers + complex_featurizers
  def random_test_train_valid_test_split_from_sdf(self):
    """Test of singletask CoulombMatrixEig regression on .sdf file."""
    splittype = "random"
    input_transforms = []
    output_transforms = ["normalize"]
    model_params = {}
    tasks = ["atomization_energy"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    current_dir = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(current_dir, "data/water.sdf")

    featurizers = [CoulombMatrixEig(6, remove_hydrogens=False)]

    input_file = os.path.join(self.current_dir, input_file)
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                mol_field=self.mol_field,
                                compound_featurizers=compound_featurizers,
                                complex_featurizers=complex_featurizers,
                                mol_field="mol",
                                featurizers=featurizers,
                                verbosity="low")

    #Featurizes samples and transforms them into NumPy arrays suitable for ML.
    #returns an instance of class FeaturizedSamples()

    samples = featurizer.featurize(input_file, self.feature_dir, self.samples_dir)
    dataset = featurizer.featurize(input_file, self.data_dir)

    # Splits featurized samples into train/test
    splitter = RandomSplitter()
    if frac_valid > 0:
      train_samples, valid_samples, test_samples = splitter.train_valid_test_split(
          samples, train_dir=self.train_dir, valid_dir=self.valid_dir,
          test_dir=self.test_dir, frac_train=frac_train,
          frac_valid=frac_valid, frac_test=frac_test)

      return train_samples, valid_samples, test_samples
    else:
      train_samples, test_samples = splitter.train_test_split(
          samples, train_dir=self.train_dir, test_dir=self.test_dir,
          frac_train=frac_train)
      return train_samples, test_samples

  def random_test_train_valid_test_split_from_sdf(self):
    """Test of singletask CoulombMatrixEig RF regression API when reading from .sdf file."""
    splittype = "random"
    input_transforms = []
    output_transforms = ["normalize"]
    model_params = {}
    tasks = ["atomization_energy"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = "data/water.sdf"
    train_samples, valid_samples, test_samples = (
        self._featurize_train_valid_test_split(
            splittype, input_file, tasks, frac_train=.8,
            frac_valid=.1, frac_test=.1))
    assert len(train_samples) == 8
    assert len(valid_samples) == 1
    assert len(test_samples) == 1
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)
    assert len(train_dataset) == 8
    assert len(valid_dataset) == 1
    assert len(test_dataset) == 1
+47 −30
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ import numpy as np
from deepchem.models.tests import TestAPI
from deepchem.models.sklearn_models import SklearnModel
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.transformers import NormalizationTransformer
from deepchem import metrics
from deepchem.metrics import Metric
@@ -28,6 +29,7 @@ from deepchem.hyperparameters import HyperparamOpt
from deepchem.models.keras_models.fcnet import MultiTaskDNN
from deepchem.models.tensorflow_models import TensorflowModel
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifier
from deepchem.splits import ScaffoldSplitter

def rf_model_builder(tasks, task_types, params_dict, model_dir, verbosity=None):
    """Builds random forests given hyperparameters.
@@ -49,19 +51,28 @@ class TestHyperparamOptAPI(TestAPI):
  def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
    """Test of hyperparam_opt with singletask RF ECFP regression API."""
    splittype = "scaffold"
    compound_featurizers = [CircularFingerprint(size=1024)]
    complex_featurizers = []
    input_transformer_classes = []
    output_transformer_classes = [NormalizationTransformer]
    featurizers = [CircularFingerprint(size=1024)]
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = "example.csv"
    train_dataset, valid_dataset, _, output_transformers, = \
        self._featurize_train_test_split(
            splittype, compound_featurizers, 
            complex_featurizers, input_transformer_classes,
            output_transformer_classes, input_file, tasks)
    input_file = os.path.join(self.current_dir, "example.csv")
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                featurizers=featurizers,
                                verbosity="low")
    dataset = featurizer.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)

    input_transformers = []
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)
    params_dict = {
      "n_estimators": [10, 100],
      "max_features": ["auto"],
@@ -130,24 +141,25 @@ class TestHyperparamOptAPI(TestAPI):

  def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
    """Straightforward test of Keras multitask deepchem classification API."""
    splittype = "scaffold"
    output_transformers = []
    input_transformers = []
    task_type = "classification"

    input_file = os.path.join(self.current_dir, "multitask_example.csv")
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: task_type for task in tasks}

    compound_featurizers = [CircularFingerprint(size=1024)]
    complex_featurizers = []
    featurizers = [CircularFingerprint(size=1024)]
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                featurizers=featurizers,
                                verbosity="low")
    dataset = featurizer.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)

    train_dataset, valid_dataset, _, transformers = self._featurize_train_test_split(
        splittype, compound_featurizers, 
        complex_featurizers, input_transformers,
        output_transformers, input_file, tasks)
    transformers = []
    metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
    params_dict= {"nb_hidden": [5, 10],
                  "activation": ["relu"],
@@ -166,14 +178,12 @@ class TestHyperparamOptAPI(TestAPI):
    optimizer = HyperparamOpt(MultiTaskDNN, tasks, task_types,
                              verbosity="low")
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, output_transformers,
      params_dict, train_dataset, valid_dataset, transformers,
      metric, logdir=None)

  def test_multitask_tf_mlp_ECFP_classification_hyperparam_opt(self):
    """Straightforward test of Tensorflow multitask deepchem classification API."""
    splittype = "scaffold"
    output_transformers = []
    input_transformers = []
    task_type = "classification"

    input_file = os.path.join(self.current_dir, "multitask_example.csv")
@@ -182,13 +192,20 @@ class TestHyperparamOptAPI(TestAPI):
             "task13", "task14", "task15", "task16"]
    task_types = {task: task_type for task in tasks}

    compound_featurizers = [CircularFingerprint(size=1024)]
    complex_featurizers = []
    featurizers = [CircularFingerprint(size=1024)]

    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                featurizers=featurizers,
                                verbosity="low")
    dataset = featurizer.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)

    transformers = []

    train_dataset, valid_dataset, _, transformers = self._featurize_train_test_split(
        splittype, compound_featurizers, 
        complex_featurizers, input_transformers,
        output_transformers, input_file, tasks)
    metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
    params_dict = {"activation": ["relu"],
                    "momentum": [.9],
@@ -220,5 +237,5 @@ class TestHyperparamOptAPI(TestAPI):
    optimizer = HyperparamOpt(model_builder, tasks, task_types,
                              verbosity="low")
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, output_transformers,
      params_dict, train_dataset, valid_dataset, transformers,
      metric, logdir=None)
+33 −2
Original line number Diff line number Diff line
@@ -162,15 +162,46 @@ class Model(object):
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(batch_size):
      y_pred_batch = self.predict_proba_on_batch(X_batch)
      batch_size = len(y_batch)
      y_pred_batch = np.squeeze(
          np.reshape(y_pred_batch, (batch_size, n_tasks, n_classes)))
      ######################################################### DEBUG
      #print("Model.predict_proba()")
      #print("y_pred_batch.shape")
      #print(y_pred_batch.shape)
      ######################################################### DEBUG
      #y_pred_batch = np.squeeze(
      #    np.reshape(y_pred_batch, (batch_size, n_tasks, n_classes)))
      y_pred_batch = np.reshape(y_pred_batch, (batch_size, n_tasks, n_classes))
      ######################################################### DEBUG
      #print("reshape")
      #print("y_pred_batch.shape")
      #print(y_pred_batch.shape)
      ######################################################### DEBUG
      y_pred_batch = undo_transforms(y_pred_batch, transformers)
      ######################################################### DEBUG
      #print("untransformed")
      #print("y_pred_batch.shape")
      #print(y_pred_batch.shape)
      ######################################################### DEBUG
      y_preds.append(y_pred_batch)
    ######################################################### DEBUG
    #print("[y_pred.shape for y_pred in y_preds]")
    #print([y_pred.shape for y_pred in y_preds])
    ######################################################### DEBUG
    y_pred = np.vstack(y_preds)
    ######################################################### DEBUG
    #print("y_pred.shape")
    #print(y_pred.shape)
    ######################################################### DEBUG
    # The iterbatches does padding with zero-weight examples on the last batch.
    # Remove padded examples.
    n_samples, n_tasks = len(dataset), len(self.tasks)
    y_pred = y_pred[:n_samples]
    ######################################################### DEBUG
    #print("Model.predict_proba()")
    #print("n_samples, y_pred.shape, y_batch.shape")
    #print(n_samples, y_pred.shape, y_batch.shape)
    #print("(n_samples, n_tasks, n_classes)")
    #print((n_samples, n_tasks, n_classes))
    ######################################################### DEBUG
    y_pred = np.reshape(y_pred, (n_samples, n_tasks, n_classes))
    return y_pred

+48 −48
Original line number Diff line number Diff line
@@ -53,57 +53,57 @@ class TestAPI(unittest.TestCase):
    # debug.
    #shutil.rmtree(self.model_dir)

  def _featurize_train_test_split(self, splittype, featurizers, 
                                  input_transformer_classes,
                                  output_transformer_classes, input_file, tasks, 
                                  protein_pdb_field=None, ligand_pdb_field=None,
                                  user_specified_features=None,
                                  split_field=None,
                                  shard_size=100):
    # Featurize input
    input_file = os.path.join(self.current_dir, input_file)
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                protein_pdb_field=protein_pdb_field,
                                ligand_pdb_field=ligand_pdb_field,
                                featurizers=featurizers,
                                user_specified_features=user_specified_features,
                                split_field=split_field,
                                verbosity="low")
  #def _featurize_train_test_split(self, splittype, featurizers, 
  #                                input_transformer_classes,
  #                                output_transformer_classes, input_file, tasks, 
  #                                protein_pdb_field=None, ligand_pdb_field=None,
  #                                user_specified_features=None,
  #                                split_field=None,
  #                                shard_size=100):
  #  # Featurize input
  #  input_file = os.path.join(self.current_dir, input_file)
  #  featurizer = DataFeaturizer(tasks=tasks,
  #                              smiles_field=self.smiles_field,
  #                              protein_pdb_field=protein_pdb_field,
  #                              ligand_pdb_field=ligand_pdb_field,
  #                              featurizers=featurizers,
  #                              user_specified_features=user_specified_features,
  #                              split_field=split_field,
  #                              verbosity="low")
  #  

  #  #Featurizes samples and transforms them into NumPy arrays suitable for ML.
  #  #returns an instance of class FeaturizedSamples()

    #Featurizes samples and transforms them into NumPy arrays suitable for ML.
    #returns an instance of class FeaturizedSamples()
  #  dataset = featurizer.featurize(input_file, self.data_dir, shard_size=shard_size)

    dataset = featurizer.featurize(input_file, self.data_dir, shard_size=shard_size)
  #  # Splits featurized samples into train/test
  #  assert splittype in ["random", "specified", "scaffold"]
  #  if splittype == "random":
  #    splitter = RandomSplitter()
  #  elif splittype == "specified":
  #    splitter = SpecifiedSplitter()
  #  elif splittype == "scaffold":
  #    splitter = ScaffoldSplitter()
  #  train_dataset, test_dataset = splitter.train_test_split(
  #      samples, self.train_dir, self.test_dir)

    # Splits featurized samples into train/test
    assert splittype in ["random", "specified", "scaffold"]
    if splittype == "random":
      splitter = RandomSplitter()
    elif splittype == "specified":
      splitter = SpecifiedSplitter()
    elif splittype == "scaffold":
      splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        samples, self.train_dir, self.test_dir)
  #  # Initialize transformers
  #  input_transformers = []
  #  for transform_class in input_transformer_classes:
  #    input_transformers.append(transform_class(
  #        transform_X=True, dataset=train_dataset))
  #  output_transformers = []
  #  for transform_class in output_transformer_classes:
  #    output_transformers.append(transform_class(
  #        transform_y=True, dataset=train_dataset))
  #  transformers = input_transformers + output_transformers

    # Initialize transformers
    input_transformers = []
    for transform_class in input_transformer_classes:
      input_transformers.append(transform_class(
          transform_X=True, dataset=train_dataset))
    output_transformers = []
    for transform_class in output_transformer_classes:
      output_transformers.append(transform_class(
          transform_y=True, dataset=train_dataset))
    transformers = input_transformers + output_transformers
  #  # Transforming train data
  #  for transformer in transformers:
  #    transformer.transform(train_dataset)
  #  # Transforming test data
  #  for transformer in transformers:
  #    transformer.transform(test_dataset)

    # Transforming train data
    for transformer in transformers:
      transformer.transform(train_dataset)
    # Transforming test data
    for transformer in transformers:
      transformer.transform(test_dataset)

    return train_dataset, test_dataset, input_transformers, output_transformers
  #  return train_dataset, test_dataset, input_transformers, output_transformers
+25 −19
Original line number Diff line number Diff line
@@ -42,17 +42,25 @@ class TestModelAPI(TestAPI):
    """Test of singletask RF ECFP regression API."""
    splittype = "scaffold"
    featurizers = [CircularFingerprint(size=1024)]
    input_transformers = []
    output_transformers = [NormalizationTransformer]
    model_params = {}
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = "example.csv"
    train_dataset, test_dataset, _, transformers, = self._featurize_train_test_split(
        splittype, featurizers, 
        complex_featurizers, input_transformers,
        output_transformers, input_file, tasks)
    input_file = os.path.join(self.current_dir, "example.csv")
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                featurizers=featurizers,
                                verbosity="low")
    dataset = featurizer.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    input_transformers = []
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
@@ -142,9 +150,6 @@ class TestModelAPI(TestAPI):
    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    #train_dataset, test_dataset, _, transformers = self._featurize_train_test_split(
    #    splittype, featurizers, input_transformers, output_transformers,
    #    input_file, tasks, shard_size=50)
    input_transformers = []
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
@@ -204,9 +209,6 @@ class TestModelAPI(TestAPI):
      for transformer in transformers:
        transformer.transform(dataset)

    #train_dataset, test_dataset, _, transformers = self._featurize_train_test_split(
    #    splittype, featurizers, input_transformers, output_transformers,
    #    input_file, tasks)
    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
@@ -287,9 +289,6 @@ class TestModelAPI(TestAPI):
  def test_multitask_keras_mlp_ECFP_classification_API(self):
    """Straightforward test of Keras multitask deepchem classification API."""
    from deepchem.models.keras_models.fcnet import MultiTaskDNN
    splittype = "scaffold"
    output_transformers = []
    input_transformers = []
    task_type = "classification"
    # TODO(rbharath): There should be some automatic check to ensure that all
    # required model_params are specified.
@@ -308,9 +307,16 @@ class TestModelAPI(TestAPI):

    featurizers = [CircularFingerprint(size=1024)]

    train_dataset, test_dataset, _, transformers = self._featurize_train_test_split(
        splittype, featurizers, input_transformers,
        output_transformers, input_file, tasks)
    data_featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                featurizers=featurizers,
                                verbosity="low")
    dataset = data_featurizer.featurize(input_file, self.data_dir)
    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    transformers = []
    model_params["data_shape"] = train_dataset.get_data_shape()
    classification_metrics = [Metric(metrics.roc_auc_score),
                              Metric(metrics.matthews_corrcoef),
Loading