Commit fa97d2ab authored by Bharath Ramsundar
Browse files

Merge branch 'master' of https://github.com/deepchem/deepchem into keras_simple

parents d680be68 09225083
Loading
Loading
Loading
Loading
+10 −217
Original line number Diff line number Diff line
@@ -19,11 +19,9 @@ import sklearn

from deepchem.data import Dataset, pad_features
from deepchem.trans import undo_transforms
from deepchem.trans import undo_grad_transforms
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import save_to_disk
from deepchem.utils.save import log
from deepchem.data import pad_batch
from deepchem.utils.evaluate import Evaluator


@@ -32,7 +30,7 @@ class Model(object):
  Abstract base class for different ML models.
  """
  def __init__(self, model_instance=None, model_dir=None,
               fit_transformers=None, verbose=True, **kwargs):
               verbose=True, **kwargs):
    """Abstract class for all models.
    Parameters:
    -----------
@@ -49,7 +47,6 @@ class Model(object):
    self.model_dir = model_dir
    self.model_instance = model_instance
    self.model_class = model_instance.__class__
    self.fit_transformers = fit_transformers

    self.verbose = verbose

@@ -60,7 +57,7 @@ class Model(object):
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X, pad_batch=False):
  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.

@@ -68,14 +65,11 @@ class Model(object):
    ----------
    X: np.ndarray
      Features
    pad_batch: bool, optional
      Ignored for Sklearn Model. Only used for Tensorflow models
      with rigid batch-size requirements.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")

  def predict_proba_on_batch(self, X, pad_batch=False):
  def predict_proba_on_batch(self, X):
    """
    Makes predictions of class probabilities on given batch of new data.

@@ -83,9 +77,6 @@ class Model(object):
    ----------
    X: np.ndarray
      Features
    pad_batch: bool, optional
      Ignored for Sklearn Model. Only used for Tensorflow models
      with rigid batch-size requirements.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")
@@ -118,7 +109,7 @@ class Model(object):
    """
    raise NotImplementedError

  def fit(self, dataset, nb_epoch=10, batch_size=50, pad_batches=False, **kwargs):
  def fit(self, dataset, nb_epoch=10, batch_size=50, **kwargs):
    """
    Fits a model on data in a Dataset object.
    """
@@ -128,31 +119,12 @@ class Model(object):
      log("Starting epoch %s" % str(epoch+1), self.verbose)
      losses = []
      for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
          batch_size, pad_batches=pad_batches):
        if self.fit_transformers:
          X_batch, y_batch, w_batch = self.transform_on_batch(X_batch, y_batch,
                                            w_batch)
        if pad_batches:
          X_batch, y_batch, w_batch, ids_batch = pad_batch(
              batch_size, X_batch, y_batch, w_batch, ids_batch)
        
          batch_size):
        losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
      log("Avg loss for epoch %d: %f"
          % (epoch+1,np.array(losses).mean()),self.verbose)


  def transform_on_batch(self, X, y, w):
    """
    Applies the fit-time transformers, in order, to one batch of arrays.

    Every transformer in `self.fit_transformers` receives the current
    (X, y, w) through `transform_on_array` and its output is handed to the
    next transformer.

    Parameters
    ----------
    X: features of the batch
    y: labels of the batch
    w: weights of the batch

    Returns
    -------
    tuple
      The transformed (X, y, w). If `self.fit_transformers` is None the
      inputs are returned unchanged.
    """
    # fit_transformers defaults to None in __init__; iterating None would
    # raise a TypeError, so treat it as "nothing to apply".
    if self.fit_transformers is None:
      return X, y, w

    # Transform X, y, and w
    for transformer in self.fit_transformers:
      X, y, w = transformer.transform_on_array(X, y, w)

    return X, y, w

  def predict(self, dataset, transformers=[], batch_size=None,
              pad_batches=False):
  def predict(self, dataset, transformers=[], batch_size=None):
    """
    Uses self to make predictions on provided Dataset object.

@@ -162,10 +134,11 @@ class Model(object):
    y_preds = []
    n_tasks = self.get_num_tasks()
    ind = 0

    for (X_batch, _, _, ids_batch) in dataset.iterbatches(
        batch_size, deterministic=True):
      n_samples = len(X_batch)
      y_pred_batch = self.predict_on_batch(X_batch, pad_batch=pad_batches)
      y_pred_batch = self.predict_on_batch(X_batch)
      # Discard any padded predictions
      y_pred_batch = y_pred_batch[:n_samples]
      y_pred_batch = np.reshape(y_pred_batch, (n_samples, n_tasks))
@@ -204,188 +177,8 @@ class Model(object):
    scores = evaluator.compute_model_performance(metrics)
    return scores

  def predict_grad(self, dataset, transformers=[], batch_size=50):
    """
    Computes the model gradient for every batch of `dataset`.

    For each batch the prediction and the gradient are evaluated on the batch
    features; the gradient is then passed, together with the prediction,
    through `undo_grad_transforms`.

    TODO(rbharath): Should we assume each model has meaningful gradients to
    predict? Should this be a subclass for PhysicalModel or the like?

    Returns:
      grad: numpy ndarray, the per-batch gradients stacked with np.vstack
    """
    collected = []
    # Only the features of each batch are needed here.
    for (features, _, _, _) in dataset.iterbatches(batch_size):
      energies = self.predict_on_batch(features)
      raw_grads = self.predict_grad_on_batch(features)
      collected.append(undo_grad_transforms(raw_grads, energies, transformers))
    return np.vstack(collected)

  def evaluate_error(self, dataset, transformers=[], batch_size=50):
    """
    Reports RMS errors of energy and gradient predictions, forcebalance-style.

    Column 0 of the labels and predictions is treated as the energy; the
    remaining columns are treated as gradient components, three per atom.
    Both errors are printed and returned.

    TODO(rbharath): This looks like it should be a subclass method for a
    PhysicalMethod class. forcebalance style errors aren't meaningful for most
    chem-informatic datasets.

    Returns:
      (energy_error, grad_error): the two RMS errors after unit conversion
    """
    pred_chunks, label_chunks = [], []
    for (X_batch, y_batch, _, _) in dataset.iterbatches(batch_size):
      # Predictions are shaped like the labels before transforms are undone.
      raw_pred = np.reshape(self.predict_on_batch(X_batch), y_batch.shape)
      pred_chunks.append(undo_transforms(raw_pred, transformers))
      label_chunks.append(undo_transforms(y_batch, transformers))

    y_pred = np.vstack(pred_chunks)
    y = np.vstack(label_chunks)

    n_samples = len(dataset)
    n_tasks = self.get_num_tasks()
    n_atoms = int((n_tasks-1)/3)
    task_shape = (n_samples, n_tasks)
    atom_shape = (n_samples, n_atoms, 3)

    y_pred = np.reshape(y_pred, task_shape)
    y = np.reshape(y, task_shape)

    energy_diff = y[:,0]-y_pred[:,0]
    # convert Hartree to kJ/mol
    energy_error = np.sqrt(np.mean(energy_diff*energy_diff))*2625.5002

    grad = np.reshape(y_pred[:,1:], atom_shape)
    grad_train = np.reshape(y[:,1:], atom_shape)
    grad_diff = grad-grad_train
    # convert Hartree/bohr to kJ/mol/Angstrom
    grad_error = np.sqrt(np.mean(grad_diff*grad_diff))*4961.47596096

    print("Energy error (RMSD): %f kJ/mol" % energy_error)
    print("Grad error (RMSD): %f kJ/mol/A" % grad_error)

    return energy_error, grad_error

  def evaluate_error_class2(self, dataset, transformers=[], batch_size=50):
    """
    Reports RMS errors of energy and gradient predictions, forcebalance-style.

    The energy error compares column 0 of the labels with column 0 of the
    predictions. The gradient error compares the output of
    `predict_grad_on_batch` with the remaining label columns, three per atom.
    Both errors are printed and returned.

    TODO(rbharath): Should be a subclass PhysicalModel method. Also, need to
    find a better name for this method (class2 doesn't tell us anything about the
    semantics of this method.

    Returns:
      (energy_error, grad_error): the two RMS errors after unit conversion
    """
    pred_chunks, label_chunks, grad_chunks = [], [], []
    for (X_batch, y_batch, _, _) in dataset.iterbatches(batch_size):
      # untransformed E is needed for undo_grad_transform
      energy_batch = self.predict_on_batch(X_batch)
      raw_grad = self.predict_grad_on_batch(X_batch)
      grad_chunks.append(
          undo_grad_transforms(raw_grad, energy_batch, transformers))

      # the reshaped prediction gives us the pred E and pred multitask trained gradE
      shaped_pred = np.reshape(energy_batch, y_batch.shape)
      pred_chunks.append(undo_transforms(shaped_pred, transformers))

      # undo transforms on y_batch should know how to handle E and gradE separately
      label_chunks.append(undo_transforms(y_batch, transformers))

    y_pred = np.vstack(pred_chunks)
    y = np.vstack(label_chunks)
    grad = np.vstack(grad_chunks)

    n_samples = len(dataset)
    n_tasks = self.get_num_tasks()
    n_atoms = int((n_tasks-1)/3)
    task_shape = (n_samples, n_tasks)
    atom_shape = (n_samples, n_atoms, 3)

    y_pred = np.reshape(y_pred, task_shape)
    y = np.reshape(y, task_shape)

    energy_diff = y[:,0]-y_pred[:,0]
    energy_error = np.sqrt(np.mean(energy_diff*energy_diff))*2625.5002

    grad = np.reshape(grad, atom_shape)
    grad_train = np.reshape(y[:,1:], atom_shape)
    grad_diff = grad-grad_train
    grad_error = np.sqrt(np.mean(grad_diff*grad_diff))*4961.47596096

    print("Energy error (RMSD): %f kJ/mol" % energy_error)
    print("Grad error (RMSD): %f kJ/mol/A" % grad_error)

    return energy_error, grad_error

  def test_fd_grad(self, dataset, transformers=[], batch_size=50):
    """
    Compares the model gradient with a centered finite-difference gradient.

    For every geometry in the dataset, each atom coordinate is displaced by
    +h/2 and -h/2 (h = 0.001), the model is evaluated on all displaced
    geometries and column 0 of the prediction is differenced to obtain a
    numerical gradient. The result of `predict_grad_on_batch` minus that
    numerical gradient is collected per geometry.
    Currently only useful if your task is energy and self contains predict_grad_on_batch.

    TODO(rbharath): This shouldn't be a method of the Model class. Perhaps a
    method of PhysicalModel subclass. Leaving it in for time-being while refactoring
    continues.

    Returns:
      numpy ndarray: the per-geometry gradient differences stacked with np.vstack
    """
    step = 0.001
    n_coords = 3
    deviations = []
    for (X_batch, _, _, _) in dataset.iterbatches(batch_size):
      for geometry in X_batch:
        n_atoms = geometry.shape[0]

        # Filling a new batch with displaced geometries: for each
        # (atom, coordinate) pair, first the +h/2 copy, then the -h/2 copy.
        displaced = []
        for atom, axis in np.ndindex(n_atoms, n_coords):
          shift = np.zeros((n_atoms, n_coords))
          shift[atom, axis] = step/2
          displaced.append(geometry+shift)
          displaced.append(geometry-shift)
        displaced = np.asarray(displaced)

        # Predict energy on displaced geometry batch
        predictions = self.predict_on_batch(displaced)
        energy = predictions[:,0]
        predictions = undo_transforms(predictions, transformers)
        paired = np.reshape(predictions[:,0], (3*n_atoms, 2))

        # Calculate numerical gradient by centered finite difference
        numeric = np.asarray([(plus-minus)/step for plus, minus in paired])
        numeric = np.reshape(numeric, (n_atoms, n_coords))

        analytic = self.predict_grad_on_batch(np.asarray([geometry]))
        # NOTE(review): the other undo_grad_transforms call sites in this file
        # pass (grad, energy, transformers); here the energy comes first.
        # Confirm against the helper's signature whether this order is intended.
        analytic = undo_grad_transforms(energy, analytic, transformers)
        # Calculate error between symbolic gradient and numerical gradient
        deviations.append(analytic-numeric)

    return np.vstack(deviations)


  def predict_proba(self, dataset, transformers=[], batch_size=None,
                    n_classes=2, pad_batches=False):
                    n_classes=2):
    """
    TODO: Do transformers even make sense here?

@@ -397,7 +190,7 @@ class Model(object):
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
        batch_size, deterministic=True):
      n_samples = len(X_batch)
      y_pred_batch = self.predict_proba_on_batch(X_batch, pad_batch=pad_batches)
      y_pred_batch = self.predict_proba_on_batch(X_batch)
      y_pred_batch = y_pred_batch[:n_samples]
      y_pred_batch = np.reshape(y_pred_batch, (n_samples, n_tasks, n_classes))
      y_pred_batch = undo_transforms(y_pred_batch, transformers)
+92 −12
Original line number Diff line number Diff line
@@ -17,7 +17,9 @@ from deepchem.models import Model
from deepchem.metrics import from_one_hot
from deepchem.nn import model_ops
from deepchem.models.tensorflow_models import utils as tf_utils
from deepchem.trans import undo_transforms
from deepchem.utils.save import log
from deepchem.utils.evaluate import Evaluator
from deepchem.data import pad_features
from tensorflow.contrib.layers.python.layers import batch_norm

@@ -109,7 +111,7 @@ class TensorflowGraphModel(Model):
               weight_init_stddevs=[.02], bias_init_consts=[1.], penalty=0.0,
               penalty_type="l2", dropouts=[0.5], learning_rate=.001,
               momentum=.9, optimizer="adam", batch_size=50, n_classes=2,
               verbose=True, seed=None, **kwargs):
               pad_batches=False, verbose=True, seed=None, **kwargs):
    """Constructs the computational graph.

    This function constructs the computational graph for the model. It relies
@@ -166,6 +168,7 @@ class TensorflowGraphModel(Model):
    self.optimizer = optimizer
    self.batch_size = batch_size
    self.n_classes = n_classes
    self.pad_batches = pad_batches
    self.verbose= verbose
    self.seed = seed
    
@@ -270,8 +273,8 @@ class TensorflowGraphModel(Model):

      return loss 

  def fit(self, dataset, nb_epoch=10, pad_batches=False, 
          max_checkpoints_to_keep=5, log_every_N_batches=50, **kwargs):
  def fit(self, dataset, nb_epoch=10, max_checkpoints_to_keep=5, 
	  log_every_N_batches=50, **kwargs):
    """Fit the model.

    Parameters
@@ -280,8 +283,6 @@ class TensorflowGraphModel(Model):
      Dataset object holding training data 
    nb_epoch: 10
      Number of training epochs.
    pad_batches: bool
      Whether or not to pad each batch to exactly be of size batch_size.
    max_checkpoints_to_keep: int
      Maximum number of checkpoints to keep; older checkpoints will be deleted.
    log_every_N_batches: int
@@ -311,7 +312,7 @@ class TensorflowGraphModel(Model):
              # Turns out there are valid cases where we don't want pad-batches
              # on by default.
              #dataset.iterbatches(batch_size, pad_batches=True)):
              dataset.iterbatches(self.batch_size, pad_batches=pad_batches)):
              dataset.iterbatches(self.batch_size, pad_batches=self.pad_batches)):
            if ind % log_every_N_batches == 0:
              log("On batch %d" % ind, self.verbose)
            # Run training op.
@@ -453,6 +454,85 @@ class TensorflowGraphModel(Model):
                    last_checkpoint)
      self._restored_model = True

  def predict(self, dataset, transformers=[]):
    """
    Uses self to make predictions on provided Dataset object.

    Batches of size `self.batch_size` are drawn deterministically from
    `dataset`; each batch prediction is cut back to the number of rows in the
    batch, reshaped to (batch rows, n_tasks) and passed through
    `undo_transforms` with the supplied transformers.

    Parameters
    ----------
    dataset: Dataset object to predict on
    transformers: list
      Transformers handed to `undo_transforms` for every batch.

    Returns:
      y_pred: numpy ndarray of shape (n_samples,) when the model has a single
        task, otherwise of shape (n_samples, n_tasks)
    """
    y_preds = []
    n_tasks = self.get_num_tasks()

    for (X_batch, _, _, _) in dataset.iterbatches(
        self.batch_size, deterministic=True):
      n_samples = len(X_batch)
      y_pred_batch = self.predict_on_batch(X_batch)
      # Discard any padded predictions
      y_pred_batch = y_pred_batch[:n_samples]
      y_pred_batch = np.reshape(y_pred_batch, (n_samples, n_tasks))
      y_pred_batch = undo_transforms(y_pred_batch, transformers)
      y_preds.append(y_pred_batch)
    y_pred = np.vstack(y_preds)

    # The iterbatches does padding with zero-weight examples on the last batch.
    # Remove padded examples. The slice is what actually drops them (same as
    # predict_proba); without it the reshape below would raise on extra rows.
    n_samples = len(dataset)
    y_pred = y_pred[:n_samples]
    y_pred = np.reshape(y_pred, (n_samples, n_tasks))
    # Special case to handle singletasks.
    if n_tasks == 1:
      y_pred = np.reshape(y_pred, (n_samples,))
    return y_pred

  def predict_proba(self, dataset, transformers=[], n_classes=2):
    """
    Predicts class probabilities for every sample of `dataset`.

    Batches of size `self.batch_size` are drawn deterministically; every batch
    output is cut back to the batch length, reshaped to
    (batch rows, n_tasks, n_classes) and passed through `undo_transforms`.

    TODO: Do transformers even make sense here?

    Returns:
      y_pred: numpy ndarray of shape (n_samples, n_tasks, n_classes)
    """
    n_tasks = self.get_num_tasks()
    chunks = []

    batches = dataset.iterbatches(self.batch_size, deterministic=True)
    for (X_batch, _, _, _) in batches:
      batch_len = len(X_batch)
      probs = self.predict_proba_on_batch(X_batch)[:batch_len]
      probs = np.reshape(probs, (batch_len, n_tasks, n_classes))
      chunks.append(undo_transforms(probs, transformers))

    stacked = np.vstack(chunks)
    # The iterbatches does padding with zero-weight examples on the last batch.
    # Remove padded examples.
    total = len(dataset)
    return np.reshape(stacked[:total], (total, n_tasks, n_classes))

  def evaluate(self, dataset, metrics, transformers=[]):
    """
    Scores this model on `dataset` through an Evaluator.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset object.
    metrics: deepchem.metrics.Metric
      Evaluation metric(s), forwarded to
      `Evaluator.compute_model_performance`.
    transformers: list
      List of deepchem.transformers.Transformer

    Returns
    -------
    dict
      Maps tasks to scores under metric.
    """
    # The Evaluator is built from (model, dataset, transformers).
    return Evaluator(self, dataset, transformers).compute_model_performance(
        metrics)

  def _find_last_checkpoint(self):
    """Finds last saved checkpoint."""
    highest_num, last_checkpoint = -np.inf, None
@@ -516,7 +596,7 @@ class TensorflowClassifier(TensorflowGraphModel):
                             name='labels_%d' % task)))
      return labels

  def predict_on_batch(self, X, pad_batch=False):
  def predict_on_batch(self, X):
    """Return model output for the provided input.

    Restore(checkpoint) must have previously been called on this object.
@@ -537,7 +617,7 @@ class TensorflowClassifier(TensorflowGraphModel):
      ValueError: If output and labels are not both 3D or both 2D.
    """
    len_unpadded = len(X)
    if pad_batch:
    if self.pad_batches:
      X = pad_features(self.batch_size, X)
    
    if not self._restored_model:
@@ -572,7 +652,7 @@ class TensorflowClassifier(TensorflowGraphModel):
    outputs = outputs[:len_unpadded]
    return outputs

  def predict_proba_on_batch(self, X, pad_batch=False):
  def predict_proba_on_batch(self, X):
    """Return model output for the provided input.

    Restore(checkpoint) must have previously been called on this object.
@@ -590,7 +670,7 @@ class TensorflowClassifier(TensorflowGraphModel):
      AssertionError: If model is not in evaluation mode.
      ValueError: If output and labels are not both 3D or both 2D.
    """
    if pad_batch:
    if self.pad_batches:
      X = pad_features(self.batch_size, X)
    if not self._restored_model:
      self.restore()
@@ -665,7 +745,7 @@ class TensorflowRegressor(TensorflowGraphModel):
                             name='labels_%d' % task)))
    return labels

  def predict_on_batch(self, X, pad_batch=False):
  def predict_on_batch(self, X):
    """Return model output for the provided input.

    Restore(checkpoint) must have previously been called on this object.
@@ -686,7 +766,7 @@ class TensorflowRegressor(TensorflowGraphModel):
      ValueError: If output and labels are not both 3D or both 2D.
    """
    len_unpadded = len(X)
    if pad_batch:
    if self.pad_batches:
      X = pad_features(self.batch_size, X)
    
    if not self._restored_model:
+4 −4
Original line number Diff line number Diff line
@@ -157,8 +157,8 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
            (self.batch_size,)) 
    return TensorflowGraph.get_feed_dict(orig_dict)
  
  def predict_proba_on_batch(self, X, pad_batch=False):
    if pad_batch:
  def predict_proba_on_batch(self, X):
    if self.pad_batches:
      X = pad_features(self.batch_size, X)
    if not self._restored_model:
      self.restore()
@@ -190,9 +190,9 @@ class TensorflowLogisticRegression(TensorflowGraphModel):

    return np.copy(outputs)

  def predict_on_batch(self, X, pad_batch=False):
  def predict_on_batch(self, X):
    
    if pad_batch:
    if self.pad_batches:
      X = pad_features(self.batch_size, X)
    
    if not self._restored_model:
+5 −7
Original line number Diff line number Diff line
@@ -199,8 +199,8 @@ class ProgressiveJointRegressor(TensorflowMultiTaskRegressor):
        name="U_layer_%d_task%d" % (i, task), dtype=tf.float32)
    return tf.matmul(prev_layer, U)

  def fit(self, dataset, nb_epoch=10, pad_batches=False, 
          max_checkpoints_to_keep=5, log_every_N_batches=50, **kwargs):
  def fit(self, dataset, nb_epoch=10, max_checkpoints_to_keep=5, 
	  log_every_N_batches=50, **kwargs):
    """Fit the model.

    Parameters
@@ -209,8 +209,6 @@ class ProgressiveJointRegressor(TensorflowMultiTaskRegressor):
      Dataset object holding training data 
    nb_epoch: 10
      Number of training epochs.
    pad_batches: bool
      Whether or not to pad each batch to exactly be of size batch_size.
    max_checkpoints_to_keep: int
      Maximum number of checkpoints to keep; older checkpoints will be deleted.
    log_every_N_batches: int
@@ -240,7 +238,7 @@ class ProgressiveJointRegressor(TensorflowMultiTaskRegressor):
              # Turns out there are valid cases where we don't want pad-batches
              # on by default.
              #dataset.iterbatches(batch_size, pad_batches=True)):
              dataset.iterbatches(self.batch_size, pad_batches=pad_batches)):
              dataset.iterbatches(self.batch_size, pad_batches=self.pad_batches)):
            if ind % log_every_N_batches == 0:
              log("On batch %d" % ind, self.verbose)
            # Run training op.
@@ -344,7 +342,7 @@ class ProgressiveJointRegressor(TensorflowMultiTaskRegressor):
            (self.batch_size,)) 
    return TensorflowGraph.get_feed_dict(orig_dict)

  def predict_on_batch(self, X, pad_batch=False):
  def predict_on_batch(self, X):
    """Return model output for the provided input.

    Restore(checkpoint) must have previously been called on this object.
@@ -365,7 +363,7 @@ class ProgressiveJointRegressor(TensorflowMultiTaskRegressor):
      ValueError: If output and labels are not both 3D or both 2D.
    """
    len_unpadded = len(X)
    if pad_batch:
    if self.pad_batches:
      X = pad_features(self.batch_size, X)
    
    if not self._restored_model:
+2 −4
Original line number Diff line number Diff line
@@ -523,7 +523,7 @@ class ProgressiveMultitaskRegressor(TensorflowMultiTaskRegressor):
      return self.eval_graph.session


  def fit_task(self, sess, dataset, task, task_train_op, nb_epoch=10, pad_batches=False,
  def fit_task(self, sess, dataset, task, task_train_op, nb_epoch=10,
               log_every_N_batches=50):
    """Fit the model.

@@ -540,8 +540,6 @@ class ProgressiveMultitaskRegressor(TensorflowMultiTaskRegressor):
      The index of the task to train on.
    nb_epoch: 10
      Number of training epochs.
    pad_batches: bool
      Whether or not to pad each batch to exactly be of size batch_size.
    max_checkpoints_to_keep: int
      Maximum number of checkpoints to keep; older checkpoints will be deleted.
    log_every_N_batches: int
@@ -564,7 +562,7 @@ class ProgressiveMultitaskRegressor(TensorflowMultiTaskRegressor):
          # Turns out there are valid cases where we don't want pad-batches
          # on by default.
          #dataset.iterbatches(batch_size, pad_batches=True)):
          dataset.iterbatches(self.batch_size, pad_batches=pad_batches)):
          dataset.iterbatches(self.batch_size, pad_batches=self.pad_batches)):
        if ind % log_every_N_batches == 0:
          log("On batch %d" % ind, self.verbose)
        feed_dict = self.construct_task_feed_dict(task, X_b, y_b, w_b, ids_b)
Loading