Commit 31b60db1 authored by Bharath Ramsundar
Browse files

Porting singletask-to-multitask initial

parent cd01f619
Loading
Loading
Loading
Loading
+12 −23
Original line number Diff line number Diff line
@@ -22,8 +22,7 @@ from deepchem.transformers import NormalizationTransformer
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.models.multitask import SingletaskToMultitask 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.ensemble import RandomForestClassifier
from deepchem.datasets import Dataset
from deepchem.hyperparameters import HyperparamOpt
from deepchem.models.keras_models.fcnet import MultiTaskDNN
@@ -53,7 +52,6 @@ class TestHyperparamOptAPI(TestAPI):
  """
  def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
    """Test of hyperparam_opt with singletask RF ECFP regression API."""
    splittype = "scaffold"
    featurizer = CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
    task_type = "regression"
@@ -90,12 +88,9 @@ class TestHyperparamOptAPI(TestAPI):

  def test_singletask_to_multitask_sklearn_hyperparam_opt(self):
    """Test of hyperparam_opt with singletask_to_multitask."""
    splittype = "scaffold"
    output_transformers = []
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: "classification" for task in tasks}
    input_file = "multitask_example.csv"
      
    n_features = 10
@@ -119,25 +114,20 @@ class TestHyperparamOptAPI(TestAPI):
    valid_dataset = Dataset.from_numpy(self.valid_dir,
                                       X_valid, y_valid, w_valid, ids_valid,
                                       tasks)
    params_dict = {
        "batch_size": [32],
        "data_shape": [train_dataset.get_data_shape()],
    }

    transformers = []
    classification_metric = Metric(metrics.matthews_corrcoef, np.mean,
                                   mode="classification")
    def model_builder(tasks, task_types, model_params, task_model_dir,
                      verbosity=None):
      """Wraps a new LogisticRegression in a SklearnModel.

      Args:
        tasks: Passed through to SklearnModel unchanged.
        task_types: Passed through to SklearnModel unchanged.
        model_params: Passed through to SklearnModel unchanged.
        task_model_dir: Passed through to SklearnModel unchanged.
        verbosity: Accepted but not forwarded to SklearnModel.

      Returns:
        A SklearnModel built around a default-constructed LogisticRegression.
      """
      # A new LogisticRegression() is created on every call, so the sklearn
      # estimator instance is not shared between the models this returns.
      return SklearnModel(tasks, task_types, model_params, task_model_dir,
                          model_instance=LogisticRegression())
    def multitask_model_builder(tasks, task_types, params_dict, logdir=None,
                                verbosity=None):
      """Builds a SingletaskToMultitask model rooted at self.model_dir.

      Args:
        tasks: Passed through to SingletaskToMultitask.
        task_types: Passed through to SingletaskToMultitask.
        params_dict: Passed through to SingletaskToMultitask.
        logdir: Accepted but unused; the model directory is always
          self.model_dir of the enclosing test case.
        verbosity: Accepted but unused.

      Returns:
        A SingletaskToMultitask that is handed the nested `model_builder`.
      """
      return SingletaskToMultitask(tasks, task_types, params_dict,
                                   self.model_dir, model_builder)

    optimizer = HyperparamOpt(multitask_model_builder, tasks, task_types,
                              verbosity="low")
    params_dict = {"n_estimators": [1, 10]}
    def multitask_model_builder(model_params, model_dir):
      """Builds a SingletaskToMultitask model backed by random forests.

      Args:
        model_params: Keyword arguments unpacked into RandomForestClassifier.
        model_dir: Directory passed to SingletaskToMultitask.

      Returns:
        A SingletaskToMultitask over the enclosing test's `tasks`, using the
        nested `model_builder` to construct its models.
      """
      def model_builder(model_dir):
        """Wraps a new RandomForestClassifier in a SklearnModel at model_dir."""
        # NOTE: this `model_dir` parameter shadows the outer argument; its value
        # is whatever the caller of this builder supplies (presumably a
        # per-task directory chosen by SingletaskToMultitask -- confirm).
        sklearn_model = RandomForestClassifier(**model_params)
        return SklearnModel(sklearn_model, model_dir)
      return SingletaskToMultitask(tasks, model_builder, model_dir)

    optimizer = HyperparamOpt(multitask_model_builder, verbosity="low")
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, output_transformers,
      params_dict, train_dataset, valid_dataset, transformers,
      classification_metric, logdir=None)

  def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
@@ -175,7 +165,6 @@ class TestHyperparamOptAPI(TestAPI):

  def test_multitask_tf_mlp_ECFP_classification_hyperparam_opt(self):
    """Straightforward test of Tensorflow multitask deepchem classification API."""
    splittype = "scaffold"
    task_type = "classification"

    input_file = os.path.join(self.current_dir, "multitask_example.csv")
+11 −50
Original line number Diff line number Diff line
@@ -18,30 +18,21 @@ class SingletaskToMultitask(Model):

  Warning: This current implementation is only functional for sklearn models. 
  """
  def __init__(self, tasks, task_types, model_params, model_dir, model_builder,
               store_in_memory=False, verbosity=None):
  def __init__(self, tasks, model_builder, model_dir, verbosity=None):
    self.tasks = tasks
    self.task_types = task_types
    self.model_params = model_params
    self.models = {}
    self.model_dir = model_dir
    # If models are TF models, they don't use up RAM, so can keep in memory
    self.task_models = {}
    self.task_model_dirs = {}
    self.model_builder = model_builder
    self.verbosity = verbosity
    self.store_in_memory = store_in_memory
    log("About to initialize singletask to multitask model",
        self.verbosity, "high")
    if not os.path.exists(self.model_dir):
      os.makedirs(self.model_dir)
    self.fit_transformers = False
    for task in self.tasks:
      task_type = self.task_types[task]
      task_model_dir = os.path.join(self.model_dir, str(task))
      if not os.path.exists(task_model_dir):
        os.makedirs(task_model_dir)
      log("Initializing model for task %s" % task,
      log("Initializing directory for task %s" % task,
          self.verbosity, "high")
      self.task_model_dirs[task] = task_model_dir

@@ -74,13 +65,9 @@ class SingletaskToMultitask(Model):
    for ind, task in enumerate(self.tasks):
      log("Fitting model for task %s" % task, self.verbosity, "high")
      task_model = self.model_builder(
          [task], {task: self.task_types[task]}, self.model_params,
          self.task_model_dirs[task],
          verbosity=self.verbosity)
          self.task_model_dirs[task])
      task_model.fit(task_datasets[ind])
      task_model.save()
      if self.store_in_memory:
        self.task_models[task] = task_model

  def predict_on_batch(self, X):
    """
@@ -90,14 +77,7 @@ class SingletaskToMultitask(Model):
    n_samples = X.shape[0]
    y_pred = np.zeros((n_samples, n_tasks))
    for ind, task in enumerate(self.tasks):
      task_type = self.task_types[task]
      if self.store_in_memory:
        task_model = self.task_models[task]
      else:
        task_model = self.model_builder(
            [task], {task: self.task_types[task]}, self.model_params,
            self.task_model_dirs[task],
            verbosity=self.verbosity)
      task_model = self.model_builder(self.task_model_dirs[task])
      task_model.reload()

      y_pred[:, ind] = task_model.predict_on_batch(X)
@@ -111,14 +91,7 @@ class SingletaskToMultitask(Model):
    n_samples = len(dataset) 
    y_pred = np.zeros((n_samples, n_tasks))
    for ind, task in enumerate(self.tasks):
      task_type = self.task_types[task]
      if self.store_in_memory:
        task_model = self.task_models[task]
      else:
        task_model = self.model_builder(
            [task], {task: self.task_types[task]}, self.model_params,
            self.task_model_dirs[task],
            verbosity=self.verbosity)
      task_model = self.model_builder(self.task_model_dirs[task])
      task_model.reload()

      y_pred[:, ind] = task_model.predict(dataset, [])
@@ -133,13 +106,7 @@ class SingletaskToMultitask(Model):
    n_samples = X.shape[0]
    y_pred = np.zeros((n_samples, n_tasks, n_classes))
    for ind, task in enumerate(self.tasks):
      if self.store_in_memory:
        task_model = self.task_models[task]
      else:
        task_model = self.model_builder(
            [task], {task: self.task_types[task]}, self.model_params,
            self.task_model_dirs[task],
            verbosity=self.verbosity)
      task_model = self.model_builder(self.task_model_dirs[task])
      task_model.reload()

      y_pred[:, ind] = task_model.predict_proba_on_batch(X)
@@ -153,13 +120,7 @@ class SingletaskToMultitask(Model):
    n_samples = len(dataset) 
    y_pred = np.zeros((n_samples, n_tasks, n_classes))
    for ind, task in enumerate(self.tasks):
      if self.store_in_memory:
        task_model = self.task_models[task]
      else:
        task_model = self.model_builder(
            [task], {task: self.task_types[task]}, self.model_params,
            self.task_model_dirs[task],
            verbosity=self.verbosity)
      task_model = self.model_builder(self.task_model_dirs[task])
      task_model.reload()

      y_pred[:, ind] = np.squeeze(task_model.predict_proba(
+5 −59
Original line number Diff line number Diff line
@@ -84,23 +84,7 @@ class TensorflowGraph(object):


class TensorflowGraphModel(object):
  """Thin wrapper holding a tensorflow graph and a few vars.

  Notes:
  
    batch_size
    penalty
    nb_epoch
    pad_batches
    penalty_type
    optimizer
    learning_rate
    momentum
    data_shape
    layer_sizes
    weight_init_stddevs
    bias_init_consts
    dropouts
  """Parent class for deepchem Tensorflow models.
  
  Classifier:
    n_classes
@@ -117,23 +101,6 @@ class TensorflowGraphModel(object):
    add_output_ops
    add_training_cost 

  Subclasses must set the following attributes:
    loss: Op to calculate training cost used for gradient calculation.
    output: Op(s) for model output for each task.
    labels: Op(s) for true labels for each task.
    weights: Op(s) for example weights for each task.
    updates: Op(s) for running updates of e.g. moving averages for batch
      normalization. Should be set to tf.no_op() if no updates are required.

  This base class provides the following attributes:
    graph: TensorFlow graph object.
    logdir: Path to the file output directory to store checkpoints etc.
    master: TensorFlow session master specification string.
    n_tasks: Integer number of tasks this model trains/evals on.
    placeholder_scope: name scope where tf.placeholders are defined.
    valid: Placeholder for a boolean tensor with shape batch_size to use as a
      mask when calculating gradient costs.

  Args:
    train: If True, model is in training mode.
    logdir: Directory for output files.
@@ -179,12 +146,6 @@ class TensorflowGraphModel(object):

    self.train_graph = self.construct_graph(training=True)
    self.eval_graph = self.construct_graph(training=False)
    ######################################################## DEBUG
    print("self.train_graph.output")
    print(self.train_graph.output)
    print("self.eval_graph.output")
    print(self.eval_graph.output)
    ######################################################## DEBUG


  def construct_graph(self, training):
@@ -202,10 +163,6 @@ class TensorflowGraphModel(object):
    with graph.as_default():
      output = self.build(graph, name_scopes, training)
      labels = self.add_label_placeholders(graph, name_scopes)
      ####################################################### DEBUG
      print("labels")
      print(labels)
      ####################################################### DEBUG
      weights = self.add_example_weight_placeholders(graph, name_scopes)

    if training:
@@ -256,9 +213,6 @@ class TensorflowGraphModel(object):
            penalty = model_ops.WeightDecay(self.penalty_type, self.penalty)
            loss += penalty

      ############################################################ DEBUG
      #return weighted_costs
      ############################################################ DEBUG
      return loss 

  def fit(self, dataset, nb_epoch=10, pad_batches=False, shuffle=False,
@@ -299,10 +253,6 @@ class TensorflowGraphModel(object):
              log("On batch %d" % ind, self.verbosity)
            # Run training op.
            feed_dict = self.construct_feed_dict(X_b, y_b, w_b, ids_b)
            ######################################################## DEBUG
            print("feed_dict.keys()")
            print(feed_dict.keys())
            ######################################################## DEBUG
            fetches = self.train_graph.output + [
                train_op, self.train_graph.loss]
            fetched_values = sess.run(
@@ -428,8 +378,8 @@ class TensorflowGraphModel(object):
    """
    weights = []
    placeholder_scope = TensorflowGraph.get_placeholder_scope(graph, name_scopes)
    for task in xrange(self.n_tasks):
    with placeholder_scope:
      for task in xrange(self.n_tasks):
        weights.append(tf.identity(
            tf.placeholder(tf.float32, shape=[None],
                           name='weights_%d' % task)))
@@ -555,8 +505,8 @@ class TensorflowClassifier(TensorflowGraphModel):
      batch_size = self.batch_size 
      n_classes = self.n_classes
      labels = []
      for task in xrange(self.n_tasks):
      with placeholder_scope:
        for task in xrange(self.n_tasks):
          labels.append(tf.identity(
              tf.placeholder(tf.float32, shape=[None, n_classes],
                             name='labels_%d' % task)))
@@ -589,10 +539,6 @@ class TensorflowClassifier(TensorflowGraphModel):
      outputs = []
      with self._get_shared_session(train=False).as_default():
        feed_dict = self.construct_feed_dict(X)
        ################################################### DEBUG
        print("feed_dict.keys()")
        print(feed_dict.keys())
        ################################################### DEBUG
        data = self._get_shared_session(train=False).run(
            self.eval_graph.output, feed_dict=feed_dict)
        batch_outputs = np.asarray(data[:n_tasks], dtype=float)