Commit f201e042 authored by Joseph Gomes's avatar Joseph Gomes
Browse files

Update FitTransformRegressor tests and example

parent 4dc5e97c
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -15,7 +15,6 @@ from deepchem.models.multitask import SingletaskToMultitask
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskRegressor
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifier
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskFitTransformRegressor
from deepchem.models.tensorflow_models.fcnet import TensorflowCoulombMatrixRegressor
from deepchem.models.tensorflow_models.robust_multitask import RobustMultitaskRegressor
from deepchem.models.tensorflow_models.robust_multitask import RobustMultitaskClassifier
from deepchem.models.tensorflow_models.lr import TensorflowLogisticRegression
+73 −161
Original line number Diff line number Diff line
@@ -190,21 +190,85 @@ class TensorflowMultiTaskRegressor(TensorflowRegressor):
    return TensorflowGraph.get_feed_dict(orig_dict)

class TensorflowMultiTaskFitTransformRegressor(TensorflowMultiTaskRegressor):
  """Implements a TensorflowMultiTaskRegressor that performs on-the-fly transformation during fit/predict"""
  """Implements a TensorflowMultiTaskRegressor that performs on-the-fly transformation during fit/predict

  Example:

  >>> n_samples = 10
  >>> n_features = 3
  >>> n_tasks = 1
  >>> ids = np.arange(n_samples)
  >>> X = np.random.rand(n_samples, n_features, n_features)
  >>> y = np.zeros((n_samples, n_tasks))
  >>> w = np.ones((n_samples, n_tasks))
  >>> dataset = dc.data.NumpyDataset(X, y, w, ids)
  >>> fit_transformers = [dc.trans.CoulombFitTransformer(dataset)]
  >>> model = dc.models.TensorflowMultiTaskFitTransformRegressor(
  ...     n_tasks, [n_features, n_features], dropouts=[0.],
  ...     learning_rate=0.003, weight_init_stddevs=[np.sqrt(6)/np.sqrt(1000)],
  ...     batch_size=n_samples, fit_transformers=fit_transformers, n_evals=1)

  """

  def __init__(self, n_tasks, n_features, logdir=None, layer_sizes=[1000],
               weight_init_stddevs=[.02], bias_init_consts=[1.], penalty=0.0,
               penalty_type="l2", dropouts=[0.5], learning_rate=0.002,
               momentum=.8, optimizer="adam", batch_size=50, n_classes=2,
               momentum=.8, optimizer="adam", batch_size=50,
               fit_transformers=[], n_evals=1, verbose=True, seed=None, **kwargs):

    """Initialize TensorflowMultiTaskFitTransformRegressor
       
    Parameters
    ----------
    n_tasks: int
      Number of tasks
    n_features: list or int
      Number of features (int), or the shape of a single example before
      the fit transformers are applied (list).
    logdir: str
      Location to save data
    layer_sizes: list
      List of layer sizes.
    weight_init_stddevs: list
      List of standard deviations for weights (sampled from zero-mean
      gaussians). One for each layer.
    bias_init_consts: list
      List of bias initializations. One for each layer.
    penalty: float
      Amount of penalty (l2 or l1 applied)
    penalty_type: str
      Either "l2" or "l1"
    dropouts: list
      List of dropout amounts. One for each layer.
    learning_rate: float
      Learning rate for model.
    momentum: float
      Momentum. Only applied if optimizer=="momentum"
    optimizer: str
      Type of optimizer applied.
    batch_size: int
      Size of minibatches for training.
    fit_transformers: list
      List of dc.trans.FitTransformer objects
    n_evals: int
      Number of evaluations per example at predict time
    verbose: bool
      Perform logging.
    seed: int
      If not none, is used as random seed for tensorflow.        

    """

    self.fit_transformers = fit_transformers
    self.n_evals = n_evals

    # Run fit transformers on dummy dataset to determine n_features after transformation
    # JSG This could be generalized by passing in init_data_shape rather than n_features
    # JSG for now this only works with full CoulombMatrix featurizer
    X_b = np.random.rand(batch_size, n_features, n_features)
    if isinstance(n_features, list):
      X_b = np.ones([batch_size]+n_features)
    elif isinstance(n_features, int):
      X_b = np.ones([batch_size, n_features])
    else:
      raise ValueError("n_features should be list or int")

    for transformer in self.fit_transformers:
      X_b = transformer.X_transform(X_b)
    n_features = X_b.shape[1]
@@ -215,11 +279,11 @@ class TensorflowMultiTaskFitTransformRegressor(TensorflowMultiTaskRegressor):
	       bias_init_consts=bias_init_consts, penalty=penalty, 
	       penalty_type=penalty_type, dropouts=dropouts, 
	       learning_rate=learning_rate, momentum=momentum, optimizer=optimizer, 
	       batch_size=batch_size, n_classes=n_classes, pad_batches=False, verbose=verbose, seed=seed, 
	       batch_size=batch_size, pad_batches=False, verbose=verbose, seed=seed, 
	       **kwargs)

  def fit(self, dataset, nb_epoch=10, max_checkpoints_to_keep=5, log_every_N_batches=50, **kwargs):
    """Fit the model.
    """Perform fit transformations on each minibatch. Fit the model.

    Parameters
    ---------- 
@@ -281,7 +345,8 @@ class TensorflowMultiTaskFitTransformRegressor(TensorflowMultiTaskRegressor):
    ############################################################## TIMING

  def predict_on_batch(self, X):
    """Return model output for the provided input.
    """Return model output for the provided input. Each example is evaluated
        self.n_evals times.

    Restore(checkpoint) must have previously been called on this object.

@@ -355,156 +420,3 @@ class TensorflowMultiTaskFitTransformRegressor(TensorflowMultiTaskRegressor):
    else:
      outputs = np.reshape(outputs, (1,))
      return outputs

class TensorflowCoulombMatrixRegressor(TensorflowMultiTaskFitTransformRegressor):
  """FitTransform regressor with sigmoid hidden layers and an optional
  per-epoch learning rate schedule.

  Differences from the parent class that are visible in this class:

  * `build` applies `tf.sigmoid` to every hidden fully connected layer.
  * `learning_rate` may be a dict mapping an epoch number to a learning
    rate (experimental). `fit` updates `self.learning_rate` from that dict
    at the start of each epoch.
  """

  def __init__(self, n_tasks, n_features, logdir=None, layer_sizes=[1000],
               weight_init_stddevs=[.02], bias_init_consts=[1.], penalty=0.0,
               penalty_type="l2", dropouts=[0.5], learning_rate={0: 0.001},
               momentum=.9, optimizer="adam", batch_size=50, n_classes=2,
               fit_transformers=[], n_evals=1, verbose=True, seed=None, **kwargs):
    """Initialize TensorflowCoulombMatrixRegressor

    Parameters
    ----------
    learning_rate: float or dict
      Either a constant learning rate, or a dict mapping an epoch number to
      the learning rate to use once training has passed that epoch. When a
      dict is given, the value stored under the smallest epoch key is used
      as the initial learning rate.

    All other arguments are forwarded unchanged to
    TensorflowMultiTaskFitTransformRegressor.__init__.

    Raises
    ------
    ValueError
      If learning_rate is an empty dict.
    """
    # NOTE: the list/dict defaults above are shared between calls; they are
    # only read here, never mutated.

    # Learning rate is set by a dictionary in this class (experimental feature)
    if isinstance(learning_rate, dict):
      if not learning_rate:
        raise ValueError("learning_rate schedule must contain at least one entry")
      self.learning_rate_schedule = True
      self.lr = learning_rate
      # Start from the entry with the smallest epoch key. Indexing
      # `dict.keys()` fails on Python 3 and is order-dependent on Python 2.
      self.learning_rate = self.lr[min(self.lr)]
    else:
      self.learning_rate_schedule = False
      self.learning_rate = learning_rate

    TensorflowMultiTaskFitTransformRegressor.__init__(self, n_tasks, n_features, logdir=logdir,
               layer_sizes=layer_sizes, weight_init_stddevs=weight_init_stddevs,
               bias_init_consts=bias_init_consts, penalty=penalty,
               penalty_type=penalty_type, dropouts=dropouts,
               learning_rate=self.learning_rate, momentum=momentum, optimizer=optimizer,
               batch_size=batch_size, n_classes=n_classes, fit_transformers=fit_transformers, n_evals=n_evals,
               verbose=verbose, seed=seed, **kwargs)

  def build(self, graph, name_scopes, training):
    """Constructs the graph architecture as specified in its config.

    This method creates the following Placeholders:
      mol_features: Molecule descriptor (e.g. fingerprint) tensor with shape
        batch_size x n_features.

    Parameters
    ----------
    graph: tf.Graph
      Graph in which the ops are created.
    name_scopes: dict
      Passed to TensorflowGraph.get_placeholder_scope.
    training: bool
      Passed to model_ops.dropout for every hidden layer.

    Returns
    -------
    list
      One squeezed output tensor per task.

    Raises
    ------
    AssertionError
      If the per-layer parameter lists differ in length or are empty.
    """
    n_features = self.n_features
    placeholder_scope = TensorflowGraph.get_placeholder_scope(
        graph, name_scopes)
    with graph.as_default():
      with placeholder_scope:
        self.mol_features = tf.placeholder(
            tf.float32,
            shape=[None, n_features],
            name='mol_features')

      layer_sizes = self.layer_sizes
      weight_init_stddevs = self.weight_init_stddevs
      bias_init_consts = self.bias_init_consts
      dropouts = self.dropouts
      lengths_set = {
          len(layer_sizes),
          len(weight_init_stddevs),
          len(bias_init_consts),
          len(dropouts),
          }
      assert len(lengths_set) == 1, 'All layer params must have same length.'
      n_layers = lengths_set.pop()
      assert n_layers > 0, 'Must have some layers defined.'

      # Hidden stack: sigmoid(fully connected) followed by dropout.
      prev_layer = self.mol_features
      prev_layer_size = n_features
      for i in range(n_layers):
        layer = tf.sigmoid(model_ops.fully_connected_layer(
            tensor=prev_layer,
            size=layer_sizes[i],
            weight_init=tf.truncated_normal(
                shape=[prev_layer_size, layer_sizes[i]],
                stddev=weight_init_stddevs[i]),
            bias_init=tf.constant(value=bias_init_consts[i],
                                  shape=[layer_sizes[i]])))
        layer = model_ops.dropout(layer, dropouts[i], training)
        prev_layer = layer
        prev_layer_size = layer_sizes[i]

      # One scalar regression head per task. The heads reuse the last hidden
      # layer's init parameters; index them explicitly instead of relying on
      # the loop variable leaking out of the loop above.
      head_stddev = weight_init_stddevs[-1]
      head_bias = bias_init_consts[-1]
      output = []
      for task in range(self.n_tasks):
        output.append(tf.squeeze(
            model_ops.fully_connected_layer(
                tensor=prev_layer,
                # Matches the [prev_layer_size, 1] weight and [1] bias below.
                size=1,
                weight_init=tf.truncated_normal(
                    shape=[prev_layer_size, 1],
                    stddev=head_stddev),
                bias_init=tf.constant(value=head_bias,
                                      shape=[1]))))
      return output

  def fit(self, dataset, nb_epoch=10, max_checkpoints_to_keep=5, log_every_N_batches=50, **kwargs):
    """Perform fit transformations on each minibatch. Fit the model.

    When a learning rate schedule was supplied at construction time,
    `self.learning_rate` is updated at the start of every epoch to the value
    of the largest schedule key that the current epoch has strictly passed.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset object holding training data
    nb_epoch: int
      Number of training epochs.
    max_checkpoints_to_keep: int
      Maximum number of checkpoints to keep; older checkpoints will be deleted.
    log_every_N_batches: int
      Report every N batches. Useful for training on very large datasets,
      where epochs can take long time to finish.
    """
    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    log("Training for %d epochs" % nb_epoch, self.verbose)

    # Sort the schedule once so that, for a given epoch, the entry with the
    # largest passed key is applied last and therefore wins.
    if self.learning_rate_schedule:
      lr_schedule = sorted(self.lr.items())
    else:
      lr_schedule = []

    with self.train_graph.graph.as_default():
      train_op = self.get_training_op(
          self.train_graph.graph, self.train_graph.loss)
      with self._get_shared_session(train=True) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)
        # Save an initial checkpoint.
        saver.save(sess, self._save_path, global_step=0)
        for epoch in range(nb_epoch):
          avg_loss, n_batches = 0., 0

          # NOTE(review): train_op is built once above; whether updating
          # self.learning_rate here reaches the optimizer depends on
          # get_training_op, which is not visible here -- confirm.
          for lr_epoch, lr_value in lr_schedule:
            if epoch > lr_epoch:
              self.learning_rate = lr_value

          for ind, (X_b, y_b, w_b, ids_b) in enumerate(
              dataset.iterbatches(self.batch_size, pad_batches=self.pad_batches)):
            if ind % log_every_N_batches == 0:
              log("On batch %d" % ind, self.verbose)
            for transformer in self.fit_transformers:
              X_b = transformer.X_transform(X_b)
            # Run training op.
            feed_dict = self.construct_feed_dict(X_b, y_b, w_b, ids_b)
            fetches = self.train_graph.output + [
                train_op, self.train_graph.loss]
            fetched_values = sess.run(fetches, feed_dict=feed_dict)
            # The loss is the last fetch; the per-task outputs are not needed.
            avg_loss += fetched_values[-1]
            n_batches += 1
          saver.save(sess, self._save_path, global_step=epoch)
          # An empty dataset yields no batches; avoid dividing by zero.
          if n_batches:
            avg_loss = float(avg_loss)/n_batches
          log('Ending epoch %d: Average loss %g' % (epoch, avg_loss), self.verbose)
        # Always save a final checkpoint when complete. nb_epoch equals
        # epoch+1 after the loop and is also defined when nb_epoch == 0.
        saver.save(sess, self._save_path, global_step=nb_epoch)
    ############################################################## TIMING
    time2 = time.time()
    log("TIMING: model fitting took %0.3f s" % (time2-time1),
        self.verbose)
    ############################################################## TIMING
+29 −0
Original line number Diff line number Diff line
@@ -167,6 +167,35 @@ class TestOverfit(test_util.TensorFlowTestCase):
    scores = model.evaluate(dataset, [classification_metric])
    assert scores[classification_metric.name] > .9

  def test_tf_fittransform_regression_overfit(self):
    """Check that a FitTransform regressor can overfit a tiny regression set.

    Builds ten random 3x3 inputs with all-zero targets and unit weights,
    trains for 100 epochs with a CoulombFitTransformer applied on the fly,
    and requires the mean squared error on the training data to be below 0.1.
    """
    n_tasks = 1
    n_features = 3
    n_samples = 10

    # Seed numpy so the random inputs are reproducible.
    np.random.seed(123)
    features = np.random.rand(n_samples, n_features, n_features)
    targets = np.zeros((n_samples, n_tasks))
    weights = np.ones((n_samples, n_tasks))
    sample_ids = np.arange(n_samples)
    dataset = dc.data.NumpyDataset(features, targets, weights, sample_ids)

    fit_transformers = [dc.trans.CoulombFitTransformer(dataset)]
    mse_metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
    init_stddev = np.sqrt(6)/np.sqrt(1000)
    model = dc.models.TensorflowMultiTaskFitTransformRegressor(
        n_tasks,
        [n_features, n_features],
        dropouts=[0.],
        learning_rate=0.003,
        weight_init_stddevs=[init_stddev],
        batch_size=n_samples,
        fit_transformers=fit_transformers,
        n_evals=1)

    # Train, then persist the fitted model.
    model.fit(dataset, nb_epoch=100)
    model.save()

    # Score on the training data itself.
    train_scores = model.evaluate(dataset, [mse_metric])
    assert train_scores[mse_metric.name] < .1

  def test_tf_skewed_classification_overfit(self):
    """Test tensorflow models can overfit 0/1 datasets with few actives."""
    #n_samples = 100