Commit f30dd679 authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #461 from peastman/checkpoint

Added option for checkpoint interval
parents 8cf6fc06 ebc86176
Loading
Loading
Loading
Loading
+6 −2
Original line number Diff line number Diff line
@@ -133,7 +133,8 @@ class Sequential(Model):
          max_checkpoints_to_keep=5,
          log_every_N_batches=50,
          learning_rate=.001,
          batch_size=50):
          batch_size=50,
          checkpoint_interval=10):
    """Trains the model for a fixed number of epochs.

    TODO(rbharath): This is mostly copied from TensorflowGraphModel. Should
@@ -151,6 +152,8 @@ class Sequential(Model):
            1 for progress bar logging, 2 for one log line per epoch.
        initial_epoch: epoch at which to start training
            (useful for resuming a previous training run)
    checkpoint_interval: int
      Number of epochs between checkpoint writes
    """
    ############################################################## TIMING
    time1 = time.time()
@@ -180,6 +183,7 @@ class Sequential(Model):
            y_pred = np.squeeze(np.array(output))
            y_b = y_b.flatten()
            n_batches += 1
          if epoch % checkpoint_interval == checkpoint_interval - 1:
            saver.save(sess, self._save_path, global_step=epoch)
          avg_loss = float(avg_loss) / n_batches
          print('Ending epoch %d: Average loss %g' % (epoch, avg_loss))
+116 −77
Original line number Diff line number Diff line
@@ -23,6 +23,7 @@ from deepchem.utils.evaluate import Evaluator
from deepchem.data import pad_features
from tensorflow.contrib.layers.python.layers import batch_norm


def softmax(x):
  """Simple numpy softmax implementation
  """
@@ -42,9 +43,12 @@ def softmax(x):
    x /= row_sum.reshape(x.shape[:2] + (1,))
  return x


class TensorflowGraph(object):
  """Simple class that holds information needed to run Tensorflow graph."""
  def __init__(self, graph, session, name_scopes, output, labels, weights, loss):

  def __init__(self, graph, session, name_scopes, output, labels, weights,
               loss):
    self.graph = graph
    self.session = session
    self.name_scopes = name_scopes
@@ -57,7 +61,8 @@ class TensorflowGraph(object):
  def get_placeholder_scope(graph, name_scopes):
    """Gets placeholder scope."""
    placeholder_root = "placeholders"
    return TensorflowGraph.shared_name_scope(placeholder_root, graph, name_scopes)
    return TensorflowGraph.shared_name_scope(placeholder_root, graph,
                                             name_scopes)

  @staticmethod
  def shared_name_scope(name, graph, name_scopes):
@@ -84,6 +89,7 @@ class TensorflowGraph(object):
      feed_dict['{}/{}:0'.format(placeholder_root, name)] = value
    return feed_dict


class TensorflowGraphModel(Model):
  """Parent class for deepchem Tensorflow models.
  
@@ -107,11 +113,25 @@ class TensorflowGraphModel(Model):
    logdir: Directory for output files.
  """

  def __init__(self, n_tasks, n_features, logdir=None, layer_sizes=[1000],
               weight_init_stddevs=[.02], bias_init_consts=[1.], penalty=0.0,
               penalty_type="l2", dropouts=[0.5], learning_rate=.001,
               momentum=.9, optimizer="adam", batch_size=50, n_classes=2,
               pad_batches=False, verbose=True, seed=None, **kwargs):
  def __init__(self,
               n_tasks,
               n_features,
               logdir=None,
               layer_sizes=[1000],
               weight_init_stddevs=[.02],
               bias_init_consts=[1.],
               penalty=0.0,
               penalty_type="l2",
               dropouts=[0.5],
               learning_rate=.001,
               momentum=.9,
               optimizer="adam",
               batch_size=50,
               n_classes=2,
               pad_batches=False,
               verbose=True,
               seed=None,
               **kwargs):
    """Constructs the computational graph.

    This function constructs the computational graph for the model. It relies
@@ -228,7 +248,8 @@ class TensorflowGraphModel(Model):
    else:
      loss = None
      output = self.add_output_ops(graph, output)  # add softmax heads
    return TensorflowGraph(graph=graph,
    return TensorflowGraph(
        graph=graph,
        session=shared_session,
        name_scopes=name_scopes,
        output=output,
@@ -245,8 +266,8 @@ class TensorflowGraphModel(Model):
      with TensorflowGraph.shared_name_scope('costs', graph, name_scopes):
        for task in range(self.n_tasks):
          task_str = str(task).zfill(len(str(self.n_tasks)))
          with TensorflowGraph.shared_name_scope(
              'cost_{}'.format(task_str), graph, name_scopes):
          with TensorflowGraph.shared_name_scope('cost_{}'.format(task_str),
                                                 graph, name_scopes):
            with tf.name_scope('weighted'):
              weighted_cost = self.cost(output[task], labels[task],
                                        weights[task])
@@ -257,12 +278,13 @@ class TensorflowGraphModel(Model):
              # non-zero weight examples in the batch.  Also, instead of using
              # tf.reduce_mean (which can put ops on the CPU) we explicitly
              # calculate with div/sum so it stays on the GPU.
              gradient_cost = tf.div(tf.reduce_sum(weighted_cost),
                                     self.batch_size)
              gradient_cost = tf.div(
                  tf.reduce_sum(weighted_cost), self.batch_size)
              gradient_costs.append(gradient_cost)

        # aggregated costs
        with TensorflowGraph.shared_name_scope('aggregated', graph, name_scopes):
        with TensorflowGraph.shared_name_scope('aggregated', graph,
                                               name_scopes):
          with tf.name_scope('gradient'):
            loss = tf.add_n(gradient_costs)

@@ -273,8 +295,13 @@ class TensorflowGraphModel(Model):

      return loss

  def fit(self, dataset, nb_epoch=10, max_checkpoints_to_keep=5, 
	  log_every_N_batches=50, **kwargs):
  def fit(self,
          dataset,
          nb_epoch=10,
          max_checkpoints_to_keep=5,
          log_every_N_batches=50,
          checkpoint_interval=10,
          **kwargs):
    """Fit the model.

    Parameters
@@ -288,6 +315,8 @@ class TensorflowGraphModel(Model):
    log_every_N_batches: int
      Report every N batches. Useful for training on very large datasets,
      where epochs can take a long time to finish.
    checkpoint_interval: int
      Number of epochs between checkpoint writes

    Raises
    ------
@@ -299,8 +328,8 @@ class TensorflowGraphModel(Model):
    ############################################################## TIMING
    log("Training for %d epochs" % nb_epoch, self.verbose)
    with self.train_graph.graph.as_default():
      train_op = self.get_training_op(
          self.train_graph.graph, self.train_graph.loss)
      train_op = self.get_training_op(self.train_graph.graph,
                                      self.train_graph.loss)
      with self._get_shared_session(train=True) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)
@@ -312,13 +341,15 @@ class TensorflowGraphModel(Model):
              # Turns out there are valid cases where we don't want pad-batches
              # on by default.
              #dataset.iterbatches(batch_size, pad_batches=True)):
              dataset.iterbatches(self.batch_size, pad_batches=self.pad_batches)):
              dataset.iterbatches(
                  self.batch_size, pad_batches=self.pad_batches)):
            if ind % log_every_N_batches == 0:
              log("On batch %d" % ind, self.verbose)
            # Run training op.
            feed_dict = self.construct_feed_dict(X_b, y_b, w_b, ids_b)
            fetches = self.train_graph.output + [
                train_op, self.train_graph.loss]
                train_op, self.train_graph.loss
            ]
            fetched_values = sess.run(fetches, feed_dict=feed_dict)
            output = fetched_values[:len(self.train_graph.output)]
            loss = fetched_values[-1]
@@ -326,15 +357,16 @@ class TensorflowGraphModel(Model):
            y_pred = np.squeeze(np.array(output))
            y_b = y_b.flatten()
            n_batches += 1
          if epoch % checkpoint_interval == checkpoint_interval - 1:
            saver.save(sess, self._save_path, global_step=epoch)
          avg_loss = float(avg_loss) / n_batches
          log('Ending epoch %d: Average loss %g' % (epoch, avg_loss), self.verbose)
          log('Ending epoch %d: Average loss %g' % (epoch, avg_loss),
              self.verbose)
        # Always save a final checkpoint when complete.
        saver.save(sess, self._save_path, global_step=epoch + 1)
    ############################################################## TIMING
    time2 = time.time()
    print("TIMING: model fitting took %0.3f s" % (time2-time1),
          self.verbose)
    print("TIMING: model fitting took %0.3f s" % (time2 - time1), self.verbose)
    ############################################################## TIMING

  def add_output_ops(self, graph, output):
@@ -365,7 +397,6 @@ class TensorflowGraphModel(Model):
    """
    raise NotImplementedError('Must be overridden by concrete subclass')


  def add_label_placeholders(self, graph, name_scopes):
    """Add Placeholders for labels for each task.

@@ -389,12 +420,14 @@ class TensorflowGraphModel(Model):
    feeding and fetching the same tensor.
    """
    weights = []
    placeholder_scope = TensorflowGraph.get_placeholder_scope(graph, name_scopes)
    placeholder_scope = TensorflowGraph.get_placeholder_scope(graph,
                                                              name_scopes)
    with placeholder_scope:
      for task in range(self.n_tasks):
        weights.append(tf.identity(
            tf.placeholder(tf.float32, shape=[None],
                           name='weights_%d' % task)))
        weights.append(
            tf.identity(
                tf.placeholder(
                    tf.float32, shape=[None], name='weights_%d' % task)))
    return weights

  def cost(self, output, labels, weights):
@@ -421,7 +454,8 @@ class TensorflowGraphModel(Model):
    A training op.
    """
    with graph.as_default():
      opt = model_ops.optimizer(self.optimizer, self.learning_rate, self.momentum)
      opt = model_ops.optimizer(self.optimizer, self.learning_rate,
                                self.momentum)
      return opt.minimize(loss, name='train')

  def _get_shared_session(self, train):
@@ -450,8 +484,7 @@ class TensorflowGraphModel(Model):
      last_checkpoint = self._find_last_checkpoint()
      # TODO(rbharath): Is setting train=False right here?
      saver = tf.train.Saver()
      saver.restore(self._get_shared_session(train=False),
                    last_checkpoint)
      saver.restore(self._get_shared_session(train=False), last_checkpoint)
      self._restored_model = True

  def predict(self, dataset, transformers=[]):
@@ -550,6 +583,7 @@ class TensorflowGraphModel(Model):
          pass
    return os.path.join(self.logdir, last_checkpoint)


class TensorflowClassifier(TensorflowGraphModel):
  """Classification model.

@@ -557,6 +591,7 @@ class TensorflowClassifier(TensorflowGraphModel):
    output: logits op(s) used for computing classification loss and predicted
      class probabilities for each task.
  """

  def get_task_type(self):
    """Returns the string label identifying this model's task type."""
    return "classification"

@@ -573,8 +608,8 @@ class TensorflowClassifier(TensorflowGraphModel):
      A tensor with shape batch_size containing the weighted cost for each
      example.
    """
    return tf.mul(tf.nn.softmax_cross_entropy_with_logits(logits, labels),
                  weights)
    return tf.mul(
        tf.nn.softmax_cross_entropy_with_logits(logits, labels), weights)

  def add_label_placeholders(self, graph, name_scopes):
    """Add Placeholders for labels for each task.
@@ -585,15 +620,19 @@ class TensorflowClassifier(TensorflowGraphModel):
    Placeholders are wrapped in identity ops to avoid the error caused by
    feeding and fetching the same tensor.
    """
    placeholder_scope = TensorflowGraph.get_placeholder_scope(graph, name_scopes)
    placeholder_scope = TensorflowGraph.get_placeholder_scope(graph,
                                                              name_scopes)
    with graph.as_default():
      batch_size = self.batch_size
      n_classes = self.n_classes
      labels = []
      with placeholder_scope:
        for task in range(self.n_tasks):
          labels.append(tf.identity(
              tf.placeholder(tf.float32, shape=[None, n_classes],
          labels.append(
              tf.identity(
                  tf.placeholder(
                      tf.float32,
                      shape=[None, n_classes],
                      name='labels_%d' % task)))
      return labels

@@ -639,14 +678,12 @@ class TensorflowClassifier(TensorflowGraphModel):
        elif batch_output.ndim == 2:
          batch_output = batch_output.transpose((1, 0))
        else:
          raise ValueError(
              'Unrecognized rank combination for output: %s' %
          raise ValueError('Unrecognized rank combination for output: %s' %
                           (batch_output.shape,))
        output.append(batch_output)

        outputs = np.array(from_one_hot(
            np.squeeze(np.concatenate(output)), axis=-1))

        outputs = np.array(
            from_one_hot(np.squeeze(np.concatenate(output)), axis=-1))

    outputs = np.copy(outputs)
    outputs = np.reshape(outputs, (len(X), n_tasks))
@@ -689,8 +726,7 @@ class TensorflowClassifier(TensorflowGraphModel):
        elif batch_outputs.ndim == 2:
          batch_outputs = batch_outputs.transpose((1, 0))
        else:
          raise ValueError(
              'Unrecognized rank combination for output: %s ' %
          raise ValueError('Unrecognized rank combination for output: %s ' %
                           (batch_outputs.shape,))

      # Note that softmax is already applied in construct_graph
@@ -698,6 +734,7 @@ class TensorflowClassifier(TensorflowGraphModel):

    return np.copy(outputs)


class TensorflowRegressor(TensorflowGraphModel):
  """Regression model.

@@ -705,6 +742,7 @@ class TensorflowRegressor(TensorflowGraphModel):
    output: Op(s) used for computing regression loss and predicted regression
      outputs for each task.
  """

  def get_task_type(self):
    """Returns the string label identifying this model's task type."""
    return "regressor"

@@ -735,15 +773,17 @@ class TensorflowRegressor(TensorflowGraphModel):
    Placeholders are wrapped in identity ops to avoid the error caused by
    feeding and fetching the same tensor.
    """
    placeholder_scope = TensorflowGraph.get_placeholder_scope(graph, name_scopes)
    placeholder_scope = TensorflowGraph.get_placeholder_scope(graph,
                                                              name_scopes)
    with graph.as_default():
      batch_size = self.batch_size
      labels = []
      with placeholder_scope:
        for task in range(self.n_tasks):
          labels.append(tf.identity(
              tf.placeholder(tf.float32, shape=[None],
                             name='labels_%d' % task)))
          labels.append(
              tf.identity(
                  tf.placeholder(
                      tf.float32, shape=[None], name='labels_%d' % task)))
    return labels

  def predict_on_batch(self, X):
@@ -793,8 +833,7 @@ class TensorflowRegressor(TensorflowGraphModel):
          n_samples = len(X)
          batch_outputs = batch_outputs.reshape((n_samples, n_tasks))
        else:
          raise ValueError(
              'Unrecognized rank combination for output: %s' %
          raise ValueError('Unrecognized rank combination for output: %s' %
                           (batch_outputs.shape))
        # Prune away any padding that was added
        batch_outputs = batch_outputs[:n_samples]
+5 −1
Original line number Diff line number Diff line
@@ -308,6 +308,7 @@ class TensorflowMultiTaskFitTransformRegressor(TensorflowMultiTaskRegressor):
          nb_epoch=10,
          max_checkpoints_to_keep=5,
          log_every_N_batches=50,
          checkpoint_interval=10,
          **kwargs):
    """Perform fit transformations on each minibatch. Fit the model.

@@ -322,6 +323,8 @@ class TensorflowMultiTaskFitTransformRegressor(TensorflowMultiTaskRegressor):
    log_every_N_batches: int
      Report every N batches. Useful for training on very large datasets,
      where epochs can take a long time to finish.
    checkpoint_interval: int
      Number of epochs between checkpoint writes

    Raises
    ------
@@ -361,6 +364,7 @@ class TensorflowMultiTaskFitTransformRegressor(TensorflowMultiTaskRegressor):
            y_pred = np.squeeze(np.array(output))
            y_b = y_b.flatten()
            n_batches += 1
          if epoch % checkpoint_interval == checkpoint_interval - 1:
            saver.save(sess, self._save_path, global_step=epoch)
          avg_loss = float(avg_loss) / n_batches
          log('Ending epoch %d: Average loss %g' % (epoch, avg_loss),
+84 −71

File changed.

Preview size limit exceeded, changes collapsed.

+125 −96

File changed.

Preview size limit exceeded, changes collapsed.

Loading