Commit 557bfba0 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Cleanup

parent a2658df2
Loading
Loading
Loading
Loading
+0 −5
Original line number Diff line number Diff line
@@ -8,12 +8,7 @@ from __future__ import unicode_literals
from deepchem.models.models import Model
from deepchem.models.sklearn_models import SklearnModel
from deepchem.models.xgboost_models import XGBoostModel
from deepchem.models.tf_new_models.multitask_classifier import MultitaskGraphClassifier
from deepchem.models.tf_new_models.multitask_regressor import MultitaskGraphRegressor, DTNNMultitaskGraphRegressor

from deepchem.models.tf_new_models.support_classifier import SupportGraphClassifier
from deepchem.models.multitask import SingletaskToMultitask
from deepchem.models.sequential import Sequential

from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskRegressor
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifier
+0 −321
Original line number Diff line number Diff line
@@ -18,324 +18,3 @@ import tempfile
import numpy as np
import tensorflow as tf
from deepchem.models.models import Model
from deepchem.nn import model_ops
from deepchem.nn.copy import Layer
from deepchem.nn.copy import InputLayer


class Sequential(Model):
  """Linear stack of layers built inside a private TensorFlow graph.

  The model owns its own `tf.Graph` and `tf.Session`. Layers are applied in
  the order in which they are added: `add_features()` must be called first
  to register the input layer, after which `add()` stacks further layers on
  top of the current output tensor.

  Parameters
  ----------
  name: str, optional
    Name of the model. If not given, a unique name of the form
    `sequential_<uid>` is generated with `model_ops.get_uid`.
  logdir: str, optional
    Forwarded to the `Model` base class as `model_dir`. Checkpoints are
    written to `model.ckpt` inside `self.model_dir`.

  Example
  -------
  >>> import deepchem as dc
  >>> model = dc.models.Sequential()
  >>> # Add features
  >>> model.add_features(dc.nn.Input(shape=(50,)))
  >>> # Add labels
  >>> model.add_labels(dc.nn.Input(shape=(1,)))
  >>> model.add(dc.nn.Dense(32, 50))
  >>> model.add(dc.nn.Dense(64, 32))
  """

  def __init__(self, name=None, logdir=None):
    # NOTE(review): `self` is passed positionally to Model.__init__ in
    # addition to the implicit bound `self`; presumably it lands in the base
    # class's first optional parameter -- confirm this is intended.
    super(Sequential, self).__init__(self, model_dir=logdir)
    self.layers = []  # stack of layers, input layer first
    self.outputs = None  # output of the topmost layer; set by add_features()

    if not name:
      prefix = 'sequential_'
      name = prefix + str(model_ops.get_uid(prefix))
    self.name = name
    self.graph = tf.Graph()

    config = tf.ConfigProto(allow_soft_placement=True)
    self.session = tf.Session(graph=self.graph, config=config)
    # Path to save checkpoint files
    self._save_path = os.path.join(self.model_dir, 'model.ckpt')

  def add(self, layer):
    """Adds a layer instance on top of the layer stack.

    Parameters
    ----------
    layer: Layer
      Layer instance. It is called on the current output tensor and its
      result becomes the new model output.

    Raises
    ------
    TypeError
      If `layer` is not an instance of `Layer`.
    ValueError
      If `add_features()` has not been called yet.
    """
    if not isinstance(layer, Layer):
      raise TypeError("The added layer must be an instance of class Layer. "
                      "Found: " + str(layer))
    with self.graph.as_default():
      if not self.layers:
        raise ValueError("Call add_features() before calling add()")
      self.outputs = layer(self.outputs)
      self.layers.append(layer)

  def add_features(self, layer):
    """Adds the input (feature) layer; must precede every call to add().

    Parameters
    ----------
    layer: InputLayer
      Input layer. The first element of the value returned by calling it is
      stored as `self.features` and becomes the initial model output.

    Raises
    ------
    ValueError
      If layers were already added or `layer` is not an `InputLayer`.
    """
    if self.layers:
      raise ValueError(
          "add_features() has to be called before layers are added.")
    if not isinstance(layer, InputLayer):
      raise ValueError("First layer in sequential model must be InputLayer")
    with self.graph.as_default():
      self.features = layer()[0]
      self.outputs = self.features
      self.layers = [layer]

  def add_labels(self, layer):
    """Adds a layer for labels.

    Parameters
    ----------
    layer: callable
      Called with no arguments inside the model graph; the first element of
      its return value is stored as `self.labels`.
    """
    with self.graph.as_default():
      self.labels = layer()[0]

  def add_loss(self, loss, inputs=None):
    """Adds a loss to the model.

    `add_labels()` and `add_features()` must have been called before, since
    the loss is built from `self.outputs` and `self.labels`.

    Parameters
    ----------
    loss: callable
      Called as `loss(outputs, labels)`; its result is summed with
      `tf.reduce_sum` and stored as the scalar `self.loss`.
    inputs: optional
      Currently unused; kept for interface compatibility.
    """
    # Add losses to graph
    with self.graph.as_default():
      # Loss for each batch element
      batch_loss = loss(self.outputs, self.labels)
      # Reduce to a single scalar so it can be minimized directly.
      self.loss = tf.reduce_sum(batch_loss)

  @property
  def uses_learning_phase(self):
    """True if any layer in the stack reports `uses_learning_phase`.

    The previous implementation returned `self.uses_learning_phase`, i.e. the
    property itself, and therefore recursed until RecursionError.
    """
    # NOTE(review): assumes layers expose a boolean `uses_learning_phase`
    # attribute (Keras convention); layers without it count as False.
    return any(
        getattr(layer, 'uses_learning_phase', False) for layer in self.layers)

  def fit(self,
          dataset,
          nb_epoch=10,
          max_checkpoints_to_keep=5,
          log_every_N_batches=50,
          learning_rate=.001,
          batch_size=50,
          checkpoint_interval=10):
    """Trains the model for a fixed number of epochs.

    TODO(rbharath): This is mostly copied from TensorflowGraphModel. Should
    eventually refactor both together.

    All variables are (re-)initialized at the start of every call. The
    model's session is left open afterwards so that the model remains usable.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset object holding training data.
    nb_epoch: int
      Number of training epochs.
    max_checkpoints_to_keep: int
      Maximum number of checkpoints retained by the saver.
    log_every_N_batches: int
      A progress line is printed every this many batches.
    learning_rate: float
      Learning rate handed to the Adam optimizer.
    batch_size: int
      Number of samples per gradient update.
    checkpoint_interval: int
      Frequency at which to write checkpoints, measured in epochs.
    """
    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    print("Training for %d epochs" % nb_epoch)
    with self.graph.as_default():
      opt = model_ops.optimizer("adam", learning_rate)
      train_op = opt.minimize(self.loss, name='train')
      # Do not use `with self.session as sess`: leaving that block closes
      # the session, which is owned by the model and created only once.
      sess = self.session
      with sess.as_default():
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)
        # Save an initial checkpoint.
        saver.save(sess, self._save_path, global_step=0)
        for epoch in range(nb_epoch):
          total_loss, n_batches = 0., 0
          # TODO(rbharath): Don't support example weighting yet.
          for ind, (X_b, y_b, w_b,
                    ids_b) in enumerate(dataset.iterbatches(batch_size)):
            if ind % log_every_N_batches == 0:
              print("On batch %d" % ind)
            feed_dict = {self.features: X_b, self.labels: y_b}
            _, loss = sess.run([train_op, self.loss], feed_dict=feed_dict)
            total_loss += loss
            n_batches += 1
          if epoch % checkpoint_interval == checkpoint_interval - 1:
            saver.save(sess, self._save_path, global_step=epoch)
          # An empty dataset yields no batches; report NaN instead of
          # dividing by zero.
          if n_batches:
            avg_loss = float(total_loss) / n_batches
          else:
            avg_loss = float('nan')
          print('Ending epoch %d: Average loss %g' % (epoch, avg_loss))
        # Always save a final checkpoint when complete. `nb_epoch` equals the
        # last epoch index + 1 and is also defined when no epoch was run.
        saver.save(sess, self._save_path, global_step=nb_epoch)
    ############################################################## TIMING
    time2 = time.time()
    print("TIMING: model fitting took %0.3f s" % (time2 - time1))
    ############################################################## TIMING

  # NOTE(review): the methods below were carried over from Keras. They rely
  # on `self.model` and `self.build()`, neither of which is defined in this
  # class -- confirm whether the base class provides them before relying on
  # these methods.

  def evaluate(self,
               x,
               y,
               batch_size=32,
               verbose=1,
               sample_weight=None,
               **kwargs):
    """Computes the loss on some input data, batch by batch.

    Delegates to `self.model.evaluate`.

    Parameters
    ----------
    x: input data, as a Numpy array or list of Numpy arrays
        (if the model has multiple inputs).
    y: labels, as a Numpy array.
    batch_size: integer. Number of samples per evaluation batch.
    verbose: verbosity mode, 0 or 1.
    sample_weight: sample weights, as a Numpy array.
    kwargs: only the deprecated `show_accuracy` is accepted (and ignored
        with a warning).

    Returns
    -------
    Whatever `self.model.evaluate` returns.

    Raises
    ------
    RuntimeError
      If `self.model` is None.
    TypeError
      If unknown keyword arguments are passed.
    """
    if self.model is None:
      raise RuntimeError('The model needs to be compiled before being used.')
    if 'show_accuracy' in kwargs:
      kwargs.pop('show_accuracy')
      warnings.warn('The "show_accuracy" argument is deprecated, '
                    'instead you should pass the "accuracy" metric to '
                    'the model at compile time:\n'
                    '`model.compile(optimizer, loss, '
                    'metrics=["accuracy"])`')
    if kwargs:
      raise TypeError('Received unknown keyword arguments: ' + str(kwargs))
    return self.model.evaluate(
        x,
        y,
        batch_size=batch_size,
        verbose=verbose,
        sample_weight=sample_weight)

  def predict(self, x, batch_size=32, verbose=0):
    """Generates output predictions for the input samples in batches.

    Calls `self.build()` first if `self.model` is None, then delegates to
    `self.model.predict`.

    Parameters
    ----------
    x: the input data, as a Numpy array.
    batch_size: integer.
    verbose: verbosity mode, 0 or 1.

    Returns
    -------
    Whatever `self.model.predict` returns.
    """
    if self.model is None:
      self.build()
    return self.model.predict(x, batch_size=batch_size, verbose=verbose)

  def predict_on_batch(self, x):
    """Returns predictions for a single batch of samples.

    Calls `self.build()` first if `self.model` is None, then delegates to
    `self.model.predict_on_batch`.
    """
    if self.model is None:
      self.build()
    return self.model.predict_on_batch(x)

  def train_on_batch(self,
                     x,
                     y,
                     class_weight=None,
                     sample_weight=None,
                     **kwargs):
    """Single gradient update over one batch of samples.

    Delegates to `self.model.train_on_batch`.

    Parameters
    ----------
    x: input data, as a Numpy array or list of Numpy arrays
        (if the model has multiple inputs).
    y: labels, as a Numpy array.
    class_weight: dictionary mapping classes to a weight value,
        used for scaling the loss function (during training only).
    sample_weight: sample weights, as a Numpy array.
    kwargs: only the deprecated `accuracy` is accepted (and ignored with a
        warning).

    Returns
    -------
    Whatever `self.model.train_on_batch` returns.

    Raises
    ------
    RuntimeError
      If `self.model` is None.
    TypeError
      If unknown keyword arguments are passed.
    """
    if self.model is None:
      raise RuntimeError('The model needs to be compiled before being used.')
    if 'accuracy' in kwargs:
      kwargs.pop('accuracy')
      warnings.warn('The "accuracy" argument is deprecated, '
                    'instead you should pass the "accuracy" metric to '
                    'the model at compile time:\n'
                    '`model.compile(optimizer, loss, '
                    'metrics=["accuracy"])`')
    if kwargs:
      raise TypeError('Received unknown keyword arguments: ' + str(kwargs))
    return self.model.train_on_batch(
        x, y, sample_weight=sample_weight, class_weight=class_weight)

  def test_on_batch(self, x, y, sample_weight=None, **kwargs):
    """Evaluates the model over a single batch of samples.

    Delegates to `self.model.test_on_batch`.

    Parameters
    ----------
    x: input data, as a Numpy array or list of Numpy arrays
        (if the model has multiple inputs).
    y: labels, as a Numpy array.
    sample_weight: sample weights, as a Numpy array.
    kwargs: only the deprecated `accuracy` is accepted (and ignored with a
        warning).

    Returns
    -------
    Whatever `self.model.test_on_batch` returns.

    Raises
    ------
    RuntimeError
      If `self.model` is None.
    TypeError
      If unknown keyword arguments are passed.
    """
    if self.model is None:
      raise RuntimeError('The model needs to be compiled before being used.')
    if 'accuracy' in kwargs:
      kwargs.pop('accuracy')
      warnings.warn('The "accuracy" argument is deprecated, '
                    'instead you should pass the "accuracy" metric to '
                    'the model at compile time:\n'
                    '`model.compile(optimizer, loss, '
                    'metrics=["accuracy"])`')
    if kwargs:
      raise TypeError('Received unknown keyword arguments: ' + str(kwargs))
    return self.model.test_on_batch(x, y, sample_weight=sample_weight)

  def predict_proba(self, x, batch_size=32, verbose=1):
    """Generates class probability predictions for the input samples.

    Thin wrapper around `predict()` that additionally warns when any
    returned value lies outside the interval [0, 1].

    Parameters
    ----------
    x: input data, as a Numpy array or list of Numpy arrays
        (if the model has multiple inputs).
    batch_size: integer.
    verbose: verbosity mode, 0 or 1.

    Returns
    -------
    The predictions returned by `predict()`, unchanged.
    """
    preds = self.predict(x, batch_size, verbose)
    if preds.min() < 0. or preds.max() > 1.:
      warnings.warn('Network returning invalid probability values. '
                    'The last layer might not normalize predictions '
                    'into probabilities '
                    '(like softmax or sigmoid would).')
    return preds
+0 −265
Original line number Diff line number Diff line
@@ -19,268 +19,3 @@ import tempfile
from deepchem.data import pad_features
from deepchem.utils.save import log
from deepchem.models import Model
from deepchem.nn.copy import Input
from deepchem.nn.copy import Dense
from deepchem.nn import model_ops
# TODO(rbharath): Find a way to get rid of this import?
from deepchem.models.tf_new_models.graph_topology import merge_dicts


def get_loss_fn(final_loss, huber_d=1.0):
  """Returns the TensorFlow loss function registered under `final_loss`.

  Parameters
  ----------
  final_loss: str
    One of 'L2', 'weighted_L2', 'L1', 'huber', 'cross_entropy' or 'hinge'.
  huber_d: float, optional
    Threshold between the quadratic and the linear regime of the 'huber'
    loss. Ignored by every other loss.

  Returns
  -------
  callable
    'L2', 'L1' and 'huber' return `loss_fn(x, t)` (prediction, target);
    'weighted_L2', 'cross_entropy' and 'hinge' return `loss_fn(x, t, w)`
    with an additional weight tensor. The first group reduces over axis 0,
    as does 'weighted_L2'; 'cross_entropy' and 'hinge' reduce over all axes.

  Raises
  ------
  ValueError
    If `final_loss` is not one of the names listed above. Previously this
    surfaced as an UnboundLocalError on the return statement.
  """
  # Obtain appropriate loss function
  if final_loss == 'L2':

    def loss_fn(x, t):
      diff = tf.subtract(x, t)
      return tf.reduce_sum(tf.square(diff), 0)
  elif final_loss == 'weighted_L2':

    def loss_fn(x, t, w):
      diff = tf.subtract(x, t)
      weighted_diff = tf.multiply(diff, w)
      return tf.reduce_sum(tf.square(weighted_diff), 0)
  elif final_loss == 'L1':

    def loss_fn(x, t):
      diff = tf.subtract(x, t)
      return tf.reduce_sum(tf.abs(diff), 0)
  elif final_loss == 'huber':

    def loss_fn(x, t):
      abs_diff = tf.abs(tf.subtract(x, t))
      # Huber loss: 0.5 * d^2 for |d| <= huber_d, otherwise
      # huber_d * (|d| - 0.5 * huber_d). Taking the minimum of the two
      # branches (as done before) always selects the linear one because
      # quadratic - linear == 0.5 * (|d| - huber_d)^2 >= 0. Clipping the
      # residual gives the correct piecewise form without a conditional.
      clipped = tf.minimum(abs_diff, huber_d)
      return tf.reduce_sum(
          0.5 * tf.square(clipped) + huber_d * (abs_diff - clipped), 0)
  elif final_loss == 'cross_entropy':

    def loss_fn(x, t, w):
      costs = tf.nn.sigmoid_cross_entropy_with_logits(logits=x, labels=t)
      weighted_costs = tf.multiply(costs, w)
      return tf.reduce_sum(weighted_costs)
  elif final_loss == 'hinge':

    def loss_fn(x, t, w):
      # Map {0, 1} targets onto {-1, +1} before applying the hinge.
      t = tf.multiply(2.0, t) - 1
      costs = tf.maximum(0.0, 1.0 - tf.multiply(t, x))
      weighted_costs = tf.multiply(costs, w)
      return tf.reduce_sum(weighted_costs)
  else:
    raise ValueError("Unrecognized final_loss: %s" % str(final_loss))

  return loss_fn


class MultitaskGraphClassifier(Model):
  """Deprecated multitask binary classifier on top of a graph model.

  Wraps `model`, adds one two-class logit head per task via
  `model_ops.multitask_logits`, a weighted training loss and an Adam
  optimizer, and runs everything in a session on `model.graph`.

  Parameters
  ----------
  model: object
    Graph model. Must provide a `graph` attribute and the methods
    `get_graph_topology()` and `return_outputs()`.
  n_tasks: int
    Number of classification tasks.
  n_feat: int
    Stored as `self.feat_dim`.
  logdir: str, optional
    Forwarded to the `Model` base class as `model_dir`.
  batch_size: int
    Batch size used for training, prediction and loss normalization.
  final_loss: str
    Name of the loss passed to `get_loss_fn`. The loss is invoked with
    three arguments (logits, labels, weights).
  learning_rate: float
    Learning rate of the optimizer.
  optimizer_type: str
    Only "adam" is recognized.
  learning_rate_decay_time: int
    Stored as `self.T`.
  beta1: float
    `beta1` of the Adam optimizer.
  beta2: float
    `beta2` of the Adam optimizer.
  pad_batches: bool
    Whether batches are padded to `batch_size`.
  verbose: bool
    Forwarded to the `Model` base class and used for logging.
  """

  def __init__(self,
               model,
               n_tasks,
               n_feat,
               logdir=None,
               batch_size=50,
               final_loss='cross_entropy',
               learning_rate=.001,
               optimizer_type="adam",
               learning_rate_decay_time=1000,
               beta1=.9,
               beta2=.999,
               pad_batches=True,
               verbose=True):

    warnings.warn("MultitaskGraphClassifier is deprecated. "
                  "Will be removed in DeepChem 1.4.", DeprecationWarning)
    super(MultitaskGraphClassifier, self).__init__(
        model_dir=logdir, verbose=verbose)
    self.n_tasks = n_tasks
    self.final_loss = final_loss
    self.model = model
    self.sess = tf.Session(graph=self.model.graph)

    with self.model.graph.as_default():
      # Extract model info
      self.batch_size = batch_size
      self.pad_batches = pad_batches
      # Get graph topology for x
      self.graph_topology = self.model.get_graph_topology()
      self.feat_dim = n_feat

      # Raw logit outputs
      self.logits = self.build()
      self.loss_op = self.add_training_loss(self.final_loss, self.logits)
      self.outputs = self.add_softmax(self.logits)

      self.learning_rate = learning_rate
      self.T = learning_rate_decay_time
      self.optimizer_type = optimizer_type

      self.optimizer_beta1 = beta1
      self.optimizer_beta2 = beta2

      # Set epsilon
      self.epsilon = 1e-7
      self.add_optimizer()

      # Initialize
      self.init_fn = tf.global_variables_initializer()
      self.sess.run(self.init_fn)

      # Path to save checkpoint files, which matches the
      # replicated supervisor's default path.
      self._save_path = os.path.join(self.model_dir, 'model.ckpt')

  def build(self):
    """Creates the label/weight placeholders and the per-task logits.

    Returns
    -------
    The result of `model_ops.multitask_logits` applied to the wrapped
    model's outputs; it is indexed per task by the rest of this class.
    """
    # Create target inputs
    self.label_placeholder = tf.placeholder(
        dtype='bool', shape=(None, self.n_tasks), name="label_placeholder")
    # NOTE: the tensor name below is misspelled ("placholder"); it is a
    # runtime identifier and therefore left untouched.
    self.weight_placeholder = tf.placeholder(
        dtype='float32', shape=(None, self.n_tasks), name="weight_placholder")

    feat = self.model.return_outputs()
    output = model_ops.multitask_logits(feat, self.n_tasks)
    return output

  def add_optimizer(self):
    """Creates `self.optimizer` and `self.train_op` minimizing the loss.

    Raises
    ------
    ValueError
      If `self.optimizer_type` is not "adam".
    """
    if self.optimizer_type == "adam":
      self.optimizer = tf.train.AdamOptimizer(
          self.learning_rate,
          beta1=self.optimizer_beta1,
          beta2=self.optimizer_beta2,
          epsilon=self.epsilon)
    else:
      raise ValueError("Optimizer type not recognized.")

    # Get train function
    self.train_op = self.optimizer.minimize(self.loss_op)

  def construct_feed_dict(self, X_b, y_b=None, w_b=None, training=True):
    """Builds the feed dict for one batch.

    Parameters
    ----------
    X_b: batch of featurized inputs, passed to
      `self.graph_topology.batch_to_feed_dict`.
    y_b: labels of the batch; zeros of shape (len(X_b), n_tasks) if None.
    w_b: weights of the batch; zeros of shape (len(X_b), n_tasks) if None.
    training: currently unused.

    Returns
    -------
    dict
      Merge of the label/weight placeholders and the topology feed dict.
    """
    n_samples = len(X_b)
    if y_b is None:
      y_b = np.zeros((n_samples, self.n_tasks))
    if w_b is None:
      w_b = np.zeros((n_samples, self.n_tasks))
    targets_dict = {self.label_placeholder: y_b, self.weight_placeholder: w_b}

    # Get graph information
    atoms_dict = self.graph_topology.batch_to_feed_dict(X_b)

    # TODO (hraut->rhbarath): num_datapoints should be a vector, with ith element being
    # the number of labeled data points in target_i. This is to normalize each task
    # num_dat_dict = {self.num_datapoints_placeholder : self.}

    # Get other optimizer information
    # TODO(rbharath): Figure out how to handle phase appropriately
    feed_dict = merge_dicts([targets_dict, atoms_dict])
    return feed_dict

  def add_training_loss(self, final_loss, logits):
    """Computes the training loss from the per-task logits.

    Parameters
    ----------
    final_loss: str
      Loss name handed to `get_loss_fn`. The returned function is called
      with (logits, one-hot labels, weights), so it must accept three
      arguments.
    logits: sequence indexed by task, one logits tensor per task.

    Returns
    -------
    Sum of the per-task losses divided by `self.batch_size`.
    """
    loss_fn = get_loss_fn(final_loss)  # Get loss function
    task_losses = []
    # label_placeholder of shape (batch_size, n_tasks). Split into n_tasks
    # tensors of shape (batch_size, 1)
    task_labels = tf.split(
        axis=1, num_or_size_splits=self.n_tasks, value=self.label_placeholder)
    task_weights = tf.split(
        axis=1, num_or_size_splits=self.n_tasks, value=self.weight_placeholder)
    for task in range(self.n_tasks):
      task_label_vector = task_labels[task]
      task_weight_vector = task_weights[task]
      # Convert the labels into one-hot vector encodings.
      one_hot_labels = tf.to_float(
          tf.one_hot(tf.to_int32(tf.squeeze(task_label_vector)), 2))
      # The loss functions operate on raw logits, so pass in un-softmaxed
      # logits rather than softmax outputs.
      task_loss = loss_fn(logits[task], one_hot_labels, task_weight_vector)
      task_losses.append(task_loss)
    # It's ok to divide by just the batch_size rather than the number of nonzero
    # examples (effect averages out)
    total_loss = tf.add_n(task_losses)
    total_loss = tf.div(total_loss, self.batch_size)
    return total_loss

  def add_softmax(self, outputs):
    """Returns a list with the softmax of each per-task logits tensor."""
    softmax = []
    with tf.name_scope('inference'):
      for i, logits in enumerate(outputs):
        softmax.append(tf.nn.softmax(logits, name='softmax_%d' % i))
    return softmax

  def fit(self,
          dataset,
          nb_epoch=10,
          max_checkpoints_to_keep=5,
          log_every_N_batches=50,
          checkpoint_interval=10,
          **kwargs):
    """Trains the model for `nb_epoch` passes over `dataset`.

    Parameters
    ----------
    dataset: dataset providing `iterbatches(batch_size, pad_batches=...)`.
    nb_epoch: int
      Number of training epochs.
    max_checkpoints_to_keep: int
      Currently unused (checkpointing is disabled).
    log_every_N_batches: int
      A progress line is logged every this many batches.
    checkpoint_interval: int
      Currently unused (checkpointing is disabled).
    kwargs: ignored.
    """
    # Perform the optimization
    log("Training for %d epochs" % nb_epoch, self.verbose)

    # TODO(rbharath): Disabling saving for now to try to debug.
    for epoch in range(nb_epoch):
      log("Starting epoch %d" % epoch, self.verbose)
      for batch_num, (X_b, y_b, w_b, ids_b) in enumerate(
          dataset.iterbatches(self.batch_size, pad_batches=self.pad_batches)):
        if batch_num % log_every_N_batches == 0:
          log("On batch %d" % batch_num, self.verbose)
        self.sess.run(
            self.train_op, feed_dict=self.construct_feed_dict(X_b, y_b, w_b))

  def save(self):
    """
    No-op since this model doesn't currently support saving...
    """
    pass

  def predict(self, dataset, transformers=None, **kwargs):
    """Wraps the superclass `predict` to set batch_size/padding.

    `transformers` defaults to an empty list; `None` is used as the default
    value to avoid sharing one mutable list between calls. Extra keyword
    arguments are ignored.
    """
    if transformers is None:
      transformers = []
    return super(MultitaskGraphClassifier, self).predict(
        dataset, transformers, batch_size=self.batch_size)

  def predict_proba(self, dataset, transformers=None, n_classes=2, **kwargs):
    """Wraps the superclass `predict_proba` to set batch_size/padding.

    `transformers` defaults to an empty list; `None` is used as the default
    value to avoid sharing one mutable list between calls. Extra keyword
    arguments are ignored.
    """
    if transformers is None:
      transformers = []
    return super(MultitaskGraphClassifier, self).predict_proba(
        dataset, transformers, n_classes=n_classes, batch_size=self.batch_size)

  def predict_on_batch(self, X):
    """Returns the predicted class index per sample and task.

    If `self.pad_batches` is set, `X` is first padded with `pad_features`,
    and the result has one row per padded sample.

    Returns
    -------
    np.ndarray of shape (len(X), n_tasks) holding the argmax over classes.
    """
    if self.pad_batches:
      X = pad_features(self.batch_size, X)
    # run eval data through the model
    with self.sess.as_default():
      feed_dict = self.construct_feed_dict(X)
      # One softmax output per task
      batch_outputs = self.sess.run(self.outputs, feed_dict=feed_dict)

    n_samples = len(X)
    outputs = np.zeros((n_samples, self.n_tasks))
    for task, output in enumerate(batch_outputs):
      outputs[:, task] = np.argmax(output, axis=1)
    return outputs

  def predict_proba_on_batch(self, X, n_classes=2):
    """Returns class probabilities per sample and task.

    If `self.pad_batches` is set, `X` is first padded with `pad_features`,
    and the result has one row per padded sample.

    Returns
    -------
    np.ndarray of shape (len(X), n_tasks, n_classes).
    """
    # run eval data through the model
    if self.pad_batches:
      X = pad_features(self.batch_size, X)
    with self.sess.as_default():
      feed_dict = self.construct_feed_dict(X)
      batch_outputs = self.sess.run(self.outputs, feed_dict=feed_dict)

    n_samples = len(X)
    outputs = np.zeros((n_samples, self.n_tasks, n_classes))
    for task, output in enumerate(batch_outputs):
      outputs[:, task, :] = output
    return outputs

  def get_num_tasks(self):
    """Needed to use Model.predict() from superclass."""
    return self.n_tasks
+0 −215

File changed.

Preview size limit exceeded, changes collapsed.

+0 −365

File changed.

Preview size limit exceeded, changes collapsed.

Loading