Commit ce4a5ca7 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Ported overfit tests

parent 31b60db1
Loading
Loading
Loading
Loading
+8 −7
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ from deepchem.metrics import from_one_hot
from deepchem.models.tensorflow_models import model_ops
from deepchem.models.tensorflow_models import utils as tf_utils
from deepchem.utils.save import log
from deepchem.datasets import pad_features

def softmax(x):
  """Simple numpy softmax implementation
@@ -574,9 +575,9 @@ class TensorflowRegressor(TensorflowGraphModel):
  def get_task_type(self):
    """Return the task-type tag for this model class.

    Always the literal string "regressor"; callers appear to use this tag to
    distinguish regression models from classifiers (the classifier counterpart
    is not visible in this diff — TODO confirm against the full file).
    """
    return "regressor"

  def add_output_ops(self, graph):
  def add_output_ops(self, graph, output):
    """No-op for regression models since no softmax."""
    pass
    return output

  def cost(self, output, labels, weights):
    """Calculate single-task training cost for a batch of examples.
@@ -605,8 +606,8 @@ class TensorflowRegressor(TensorflowGraphModel):
    with graph.as_default():
      batch_size = self.batch_size
      labels = []
      with placeholder_scope:
        for task in xrange(self.n_tasks):
        with tf.name_scope(placeholder_scope):
          labels.append(tf.identity(
              tf.placeholder(tf.float32, shape=[None],
                             name='labels_%d' % task)))
@@ -647,7 +648,7 @@ class TensorflowRegressor(TensorflowGraphModel):
        X = pad_features(self.batch_size, X)
        feed_dict = self.construct_feed_dict(X)
        data = self._get_shared_session(train=False).run(
            self.output, feed_dict=feed_dict)
            self.eval_graph.output, feed_dict=feed_dict)
        batch_outputs = np.asarray(data[:n_tasks], dtype=float)
        # reshape to batch_size x n_tasks x ...
        if batch_outputs.ndim == 3:
@@ -683,11 +684,11 @@ class TensorflowModel(Model):
    self.model_instance = model
    self.fit_transformers = None

  def fit(self, dataset, shuffle=False):
  def fit(self, dataset, **kwargs):
    """
    Fits TensorflowGraph to data.
    """
    self.model_instance.fit(dataset, shuffle=shuffle)
    self.model_instance.fit(dataset, **kwargs)

  def predict_on_batch(self, X):
    """
+3 −4
Original line number Diff line number Diff line
@@ -14,7 +14,6 @@ from deepchem.models.tensorflow_models import TensorflowClassifier
from deepchem.models.tensorflow_models import TensorflowRegressor
from deepchem.models.tensorflow_models import model_ops
from deepchem.metrics import to_one_hot
from deepchem.datasets import pad_features

class TensorflowMultiTaskClassifier(TensorflowClassifier):
  """Implements an icml model as configured in a model_config.proto."""
@@ -108,11 +107,11 @@ class TensorflowMultiTaskRegressor(TensorflowRegressor):
      mol_features: Molecule descriptor (e.g. fingerprint) tensor with shape
        batch_size x n_features.
    """
    n_features = self.n_inputs
    n_features = self.n_features
    placeholder_scope = TensorflowGraph.get_placeholder_scope(
        graph, name_scopes)
    with graph.as_default():
      with tf.name_scope(placeholder_scope):
      with placeholder_scope:
        self.mol_features = tf.placeholder(
            tf.float32,
            shape=[None, n_features],
@@ -149,7 +148,7 @@ class TensorflowMultiTaskRegressor(TensorflowRegressor):

      output = []
      for task in range(self.n_tasks):
        self.output.append(tf.squeeze(
        output.append(tf.squeeze(
            model_ops.FullyConnectedLayer(
                tensor=prev_layer,
                size=layer_sizes[i],
+0 −15
Original line number Diff line number Diff line
@@ -25,25 +25,10 @@ class TestMultitaskData(TestAPI):
  """
  def test_multitask_order(self):
    """Test that order of tasks in multitask datasets is preserved."""
    from deepchem.models.keras_models.fcnet import MultiTaskDNN
    splittype = "scaffold"
    output_transformers = []
    input_transformers = []
    task_type = "classification"
    # TODO(rbharath): There should be some automatic check to ensure that all
    # required model_params are specified.
    model_params = {"nb_hidden": 10, "activation": "relu",
                    "dropout": .5, "learning_rate": .01,
                    "momentum": .9, "nesterov": False,
                    "decay": 1e-4, "batch_size": 5,
                    "nb_epoch": 2, "init": "glorot_uniform",
                    "nb_layers": 1, "batchnorm": False}

    input_file = os.path.join(self.current_dir, "multitask_example.csv")
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: task_type for task in tasks}

    featurizer = CircularFingerprint(size=1024)

+107 −328

File changed.

Preview size limit exceeded, changes collapsed.

+14 −28
Original line number Diff line number Diff line
@@ -25,27 +25,17 @@ class TestSingletasktoMultitaskAPI(TestAPI):
  Test top-level API for singletask_to_multitask ML models.
  """
  def test_singletask_to_multitask_classification(self):
    splittype = "scaffold"
    compound_featurizers = [CircularFingerprint(size=1024)]
    complex_featurizers = []
    output_transformers = []
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: "classification" for task in tasks}
    input_file = "multitask_example.csv"

    n_features = 10
    n_tasks = len(tasks)
    n_tasks = 17
    tasks = range(n_tasks)
    # Define train dataset
    n_train = 100
    X_train = np.random.rand(n_train, n_features)
    y_train = np.random.randint(2, size=(n_train, n_tasks))
    w_train = np.ones_like(y_train)
    ids_train = ["C"] * n_train
    train_dataset = Dataset.from_numpy(self.train_dir,
                                       X_train, y_train, w_train, ids_train,
                                       tasks)
    train_dataset = Dataset.from_numpy(
        self.train_dir, X_train, y_train, w_train, ids_train)

    # Define test dataset
    n_test = 10
@@ -53,31 +43,27 @@ class TestSingletasktoMultitaskAPI(TestAPI):
    y_test = np.random.randint(2, size=(n_test, n_tasks))
    w_test = np.ones_like(y_test)
    ids_test = ["C"] * n_test
    test_dataset = Dataset.from_numpy(self.test_dir,
                                      X_test, y_test, w_test, ids_test,
                                      tasks)
    test_dataset = Dataset.from_numpy(
        self.test_dir, X_test, y_test, w_test, ids_test)

    params_dict = {
        "batch_size": 32,
        "data_shape": train_dataset.get_data_shape()
    }
    transformers = []
    classification_metrics = [Metric(metrics.roc_auc_score)]
    def model_builder(tasks, task_types, model_params, model_builder, verbosity=None):
      return SklearnModel(tasks, task_types, model_params, model_builder,
                          model_instance=LogisticRegression())
    multitask_model = SingletaskToMultitask(tasks, task_types, params_dict,
                                            self.model_dir, model_builder)
    def model_builder(model_dir):
      sklearn_model = LogisticRegression()
      return SklearnModel(sklearn_model, model_dir)
    multitask_model = SingletaskToMultitask(
        tasks, model_builder, self.model_dir)

    # Fit trained model
    multitask_model.fit(train_dataset)
    multitask_model.save()

    # Eval multitask_model on train
    evaluator = Evaluator(multitask_model, train_dataset, output_transformers,
    evaluator = Evaluator(multitask_model, train_dataset, transformers,
                          verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)

    # Eval multitask_model on test
    evaluator = Evaluator(multitask_model, test_dataset, output_transformers,
    evaluator = Evaluator(multitask_model, test_dataset, transformers,
                          verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)