Commit a462f876 authored by Bharath Ramsundar

Batch of changes

parent a6a537b2
+28 −0
@@ -682,6 +682,34 @@ class Dataset(object):
      save_to_disk(ys, os.path.join(self.data_dir, row['y_sums']))
      save_to_disk(yss, os.path.join(self.data_dir, row['y_sum_squares']))

  # TODO(rbharath, joegomes): Have to add more comments on why this function is
  # needed.
  def get_grad_statistics(self):
    """Computes and returns statistics of this dataset"""
    if len(self) == 0:
      return None, None, None, None
    df = self.metadata_df
    y = []
    y_n = []
    for _, row in df.iterrows():
      yy = load_from_disk(os.path.join(self.data_dir, row['y']))
      y.append(yy)
      yn = load_from_disk(os.path.join(self.data_dir, row['y_n']))
      y_n.append(np.array(yn))

    y = np.vstack(y)
    y_n = np.sum(y_n, axis=0)

    energy = y[:,0]
    grad = y[:,1:]
    # Weight each gradient row by its energy (equivalent to the elementwise loop).
    grad *= energy[:, np.newaxis]

    ydely_means = np.sum(grad, axis=0)/y_n[1:]

    return grad, ydely_means


def compute_sums_and_nb_sample(tensor, W=None):
  """
  Computes sums, squared sums of tensor along axis 0.
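For intuition, here is a minimal standalone sketch of the statistic get_grad_statistics computes, on toy data; all names below are illustrative and nothing here is part of this commit.

import numpy as np

n_samples, n_grad_tasks = 5, 6                   # e.g. 2 atoms x 3 coordinates
y = np.random.rand(n_samples, 1 + n_grad_tasks)  # column 0: energy; rest: gradient
y_n = np.full(1 + n_grad_tasks, n_samples)       # per-task sample counts

energy, grad = y[:, 0], y[:, 1:].copy()
grad *= energy[:, np.newaxis]                    # weight each gradient row by its energy
ydely_means = np.sum(grad, axis=0) / y_n[1:]
print(ydely_means.shape)                         # (6,)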
+44 −0
"""
Atomic coordinate featurizer.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

__author__ = "Joseph Gomes"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "LGPL v2.1+"

import numpy as np
from deepchem.featurizers import Featurizer

class AtomicCoordinates(Featurizer):
  """
  Nx3 matrix of Cartestian coordinates [Angstrom]
  """
  name = ['atomic_coordinates']

  def _featurize(self, mol):
    """
    Calculate atomic coordinates.

    Parameters
    ----------
    mol : RDKit Mol
          Molecule.
    """

    N = mol.GetNumAtoms()
    coords = np.zeros((N,3))
    # 0.52917721092 is the Bohr radius in Angstrom (CODATA), so dividing
    # converts the conformer coordinates from Angstrom to Bohr.
    coords_in_bohr = [mol.GetConformer(0).GetAtomPosition(i).__div__(0.52917721092)
                      for i in xrange(N)]

    for atom in xrange(N):
      coords[atom,0] = coords_in_bohr[atom].x
      coords[atom,1] = coords_in_bohr[atom].y
      coords[atom,2] = coords_in_bohr[atom].z

    coords = [coords]
    return coords
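A hedged usage sketch of the conversion above, standalone against RDKit (the featurizer machinery is bypassed; molecule and conformer generation are illustrative):

import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

BOHR_PER_ANGSTROM = 1.0 / 0.52917721092    # Bohr radius = 0.52917721092 Angstrom

mol = Chem.AddHs(Chem.MolFromSmiles("O"))  # water
AllChem.EmbedMolecule(mol)                 # embed a 3D conformer (coordinates in Angstrom)
conf = mol.GetConformer(0)
coords = np.array([[conf.GetAtomPosition(i).x,
                    conf.GetAtomPosition(i).y,
                    conf.GetAtomPosition(i).z]
                   for i in range(mol.GetNumAtoms())]) * BOHR_PER_ANGSTROM
print(coords.shape)  # (3, 3): N atoms x (x, y, z), now in Bohr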
+8 −2
@@ -103,7 +103,7 @@ class Metric(object):
  """Wrapper class for computing user-defined metrics."""

  def __init__(self, metric, task_averager=None, name=None, threshold=None,
               verbosity=None, mode=None, atomicnet=False):
    """
    Args:
      metric: function that takes args y_true, y_pred (in that order) and
@@ -136,6 +136,7 @@ class Metric(object):
        raise ValueError("Must specify mode for new metric.")
    assert mode in ["classification", "regression"]
    self.mode = mode
    self.atomicnet = atomicnet

  def compute_metric(self, y_true, y_pred, w=None, n_classes=2, filter_nans=True):
    """Compute a performance metric for each task.
@@ -181,6 +182,11 @@ class Metric(object):
      if filter_nans:
        computed_metrics = np.array(computed_metrics)
        computed_metrics = computed_metrics[~np.isnan(computed_metrics)]
      if self.atomicnet:
        # Tasks 1: are gradient components; convert the averaged error from
        # Hartree/Bohr to kJ/mol/Angstrom (2625.5002 / 0.52917721092 = 4961.47596096).
        force_error = self.task_averager(computed_metrics[1:])*4961.47596096
        print("Force error (metric: %s): %f kJ/mol/A" % (self.name, force_error))
        return computed_metrics[0]
      else:
        return self.task_averager(computed_metrics)

  def compute_singletask_metric(self, y_true, y_pred, w):
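A hedged construction sketch for the new atomicnet flag (the metric function and import path are assumptions, not part of this commit):

import numpy as np
from sklearn.metrics import mean_absolute_error
from deepchem.metrics import Metric  # import path assumed

# With atomicnet=True, compute_metric prints the averaged gradient-task error
# in kJ/mol/A and returns only the energy-task metric.
metric = Metric(mean_absolute_error, task_averager=np.mean,
                mode="regression", atomicnet=True)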
+193 −1
@@ -13,6 +13,7 @@ import tempfile
import sklearn
from deepchem.datasets import Dataset
from deepchem.transformers import undo_transforms
from deepchem.transformers import undo_grad_transforms
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import save_to_disk
from deepchem.utils.save import log
@@ -137,19 +138,210 @@ class Model(object):
    """
    y_preds = []
    batch_size = self.model_params["batch_size"]
    n_tasks = len(self.tasks)
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
        batch_size, deterministic=True):
      # Reshape to (batch, n_tasks) rather than y_batch.shape so singletask
      # and multitask outputs are handled uniformly.
      n_batch_samples = len(X_batch)
      y_pred_batch = np.reshape(self.predict_on_batch(X_batch),
                                (n_batch_samples, n_tasks))
      y_pred_batch = undo_transforms(y_pred_batch, transformers)
      y_preds.append(y_pred_batch)
    y_pred = np.vstack(y_preds)

    # iterbatches pads the last batch with zero-weight examples; drop the
    # padded rows so predictions line up with the dataset.
    n_samples = len(dataset)
    y_pred = y_pred[:n_samples]
    # Special case to handle singletasks.
    if n_tasks == 1:
      y_pred = np.squeeze(y_pred)
    return y_pred

  def predict_grad(self, dataset, transformers=[]):
    """
    Uses self to calculate gradient on provided Dataset object.

    Returns:
      grad: numpy ndarray of gradients, stacked across the dataset.
    """
    grads = []
    batch_size = self.model_params["batch_size"]
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(batch_size):
      energy_batch = self.predict_on_batch(X_batch)
      grad_batch = self.predict_grad_on_batch(X_batch)
      grad_batch = undo_grad_transforms(grad_batch, energy_batch, transformers)
      grads.append(grad_batch)
    grad = np.vstack(grads)
  
    return grad

  def evaluate_error(self, dataset, transformers=[]):
    """
    Evaluate the error in energy and gradient components, forcebalance-style.
    """
    y_preds = []
    y_train = []
    batch_size = self.model_params["batch_size"]
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(batch_size):

      y_pred_batch = self.predict_on_batch(X_batch)
      y_pred_batch = np.reshape(y_pred_batch, y_batch.shape)

      y_pred_batch = undo_transforms(y_pred_batch, transformers)
      y_preds.append(y_pred_batch)

      y_batch = undo_transforms(y_batch, transformers)
      y_train.append(y_batch)

    n_samples, n_tasks = len(dataset), len(self.tasks)
    n_atoms = int((n_tasks-1)/3)

    # Drop any zero-weight padding rows added by iterbatches on the last batch.
    y_pred = np.vstack(y_preds)[:n_samples]
    y = np.vstack(y_train)[:n_samples]
    grad = y_pred[:,1:]
    grad_train = y[:,1:]

    energy_error = y[:,0]-y_pred[:,0]
    # convert Hartree to kJ/mol
    energy_error = np.sqrt(np.mean(energy_error*energy_error))*2625.5002
 
    grad = np.reshape(grad, (n_samples, n_atoms, 3))
    grad_train = np.reshape(grad_train, (n_samples, n_atoms, 3))    
  
    grad_error = grad-grad_train
    # convert Hartree/bohr to kJ/mol/Angstrom
    grad_error = np.sqrt(np.mean(grad_error*grad_error))*4961.47596096

    print("Energy error (RMSD): %f kJ/mol" % energy_error)
    print("Grad error (RMSD): %f kJ/mol/A" % grad_error)
    
    return energy_error, grad_error
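For reference, the two conversion constants above are related (1 Hartree = 2625.5002 kJ/mol; 1 Bohr = 0.52917721092 Angstrom), so the gradient factor follows from the energy factor:

HARTREE_TO_KJ_PER_MOL = 2625.5002   # 1 Hartree in kJ/mol
ANGSTROM_PER_BOHR = 0.52917721092   # Bohr radius in Angstrom
print(HARTREE_TO_KJ_PER_MOL / ANGSTROM_PER_BOHR)  # 4961.47596096..., kJ/mol/A per Hartree/Bohr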

  def evaluate_error_class2(self, dataset, transformers=[]):
    """
    Evaluate the error in energy and gradient components, forcebalance-style.
    """
    y_preds = []
    y_train = []
    grads = []
    batch_size = self.model_params["batch_size"]
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(batch_size):

      # untransformed E is needed for undo_grad_transform
      energy_batch = self.predict_on_batch(X_batch)
      grad_batch = self.predict_grad_on_batch(X_batch)
      grad_batch = undo_grad_transforms(grad_batch, energy_batch, transformers)      
      grads.append(grad_batch)
      y_pred_batch = np.reshape(energy_batch, y_batch.shape)

      # y_pred_batch gives us the pred E and pred multitask trained gradE
      y_pred_batch = undo_transforms(y_pred_batch, transformers)
      y_preds.append(y_pred_batch)

      # undo transforms on y_batch should know how to handle E and gradE separately
      y_batch = undo_transforms(y_batch, transformers)
      y_train.append(y_batch)

    n_samples, n_tasks = len(dataset), len(self.tasks)
    n_atoms = int((n_tasks-1)/3)

    # Drop any zero-weight padding rows added by iterbatches on the last batch.
    y_pred = np.vstack(y_preds)[:n_samples]
    y = np.vstack(y_train)[:n_samples]
    grad = np.vstack(grads)[:n_samples]
    grad_train = y[:,1:]

    energy_error = y[:,0]-y_pred[:,0]
    # convert Hartree to kJ/mol
    energy_error = np.sqrt(np.mean(energy_error*energy_error))*2625.5002
 
    grad = np.reshape(grad, (n_samples, n_atoms, 3))
    grad_train = np.reshape(grad_train, (n_samples, n_atoms, 3))    
  
    grad_error = grad-grad_train
    # convert Hartree/bohr to kJ/mol/Angstrom
    grad_error = np.sqrt(np.mean(grad_error*grad_error))*4961.47596096

    print("Energy error (RMSD): %f kJ/mol" % energy_error)
    print("Grad error (RMSD): %f kJ/mol/A" % grad_error)
    
    return energy_error, grad_error

  def test_fd_grad(self, dataset, transformers=[]):
    """
    Uses self to calculate a finite difference gradient on the provided
    Dataset object. Currently only useful if the task is energy and the model
    implements predict_grad_on_batch.

    Returns:
      y_pred: numpy ndarray of differences between the analytic and finite
        difference gradients.
    """
    y_preds = []
    batch_size = self.model_params["batch_size"]
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(batch_size):

      for xb in X_batch:

        num_atoms = xb.shape[0]
        coords = 3  # x, y, z

        h = 0.001  # finite difference step size
        fd_batch = []
        # Fill a new batch with pairwise displaced geometries: for each degree
        # of freedom, append the +h/2 and then the -h/2 displacement.
        for i in xrange(num_atoms):
          for j in xrange(coords):
            displace = np.zeros((num_atoms, coords))
            displace[i][j] += h/2
            fd_batch.append(xb+displace)
            fd_batch.append(xb-displace)

        fd_batch = np.asarray(fd_batch)
        # Predict energy on the displaced geometry batch; keep the
        # untransformed energies for undo_grad_transforms below.
        y_pred_batch = self.predict_on_batch(fd_batch)
        energy = y_pred_batch[:,0]
        y_pred_batch = undo_transforms(y_pred_batch, transformers)
        y_pred_batch = y_pred_batch[:,0]
        y_pred_batch = np.reshape(y_pred_batch, (3*num_atoms, 2))

        fd_grads = []
        # Calculate numerical gradient by centered finite difference
        for x in y_pred_batch:
          fd_grads.append((x[0]-x[1])/h)

        fd_grads = np.asarray(fd_grads)
        fd_grads = np.reshape(fd_grads, (num_atoms, coords))

        xb = np.asarray([xb])
        y_pred_batch = self.predict_grad_on_batch(xb)
        # Argument order matches the other call sites: gradients first, then
        # the untransformed energies.
        y_pred_batch = undo_grad_transforms(y_pred_batch, energy, transformers)
        # Error between the symbolic and numerical gradients.
        y_pred_batch = y_pred_batch-fd_grads
        y_preds.append(y_pred_batch)

    y_pred = np.vstack(y_preds)
  
    return y_pred
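The centered-difference scheme used above, checked on a toy quadratic (standalone; not part of this commit). For f(x) = sum(x**2) the scheme is exact up to roundoff:

import numpy as np

def f(x):
    return np.sum(x ** 2)

x = np.array([1.0, -2.0, 0.5])
h = 1e-3
# grad_i ~ (f(x + h/2 * e_i) - f(x - h/2 * e_i)) / h
fd = np.array([(f(x + 0.5 * h * e) - f(x - 0.5 * h * e)) / h
               for e in np.eye(x.size)])
print(np.allclose(fd, 2 * x))  # analytic gradient of sum(x**2) is 2x -> True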


  def predict_proba(self, dataset, transformers=[], n_classes=2):
    """
    TODO: Do transformers even make sense here?
+47 −5
@@ -10,6 +10,9 @@ import numpy as np
from deepchem.utils.save import log
from deepchem.models import Model
import sklearn
from deepchem.transformers import undo_transforms

class SingletaskToMultitask(Model):
  """
@@ -99,12 +102,30 @@ class SingletaskToMultitask(Model):
            verbosity=self.verbosity)
        task_model.reload()

      if task_type == "classification":
        y_pred[:, ind] = task_model.predict_on_batch(X)
      elif task_type == "regression":
      y_pred[:, ind] = task_model.predict_on_batch(X)
    return y_pred

  def predict(self, dataset, transformers=[]):
    """
    Prediction for multitask models. 
    """
    n_tasks = len(self.tasks)
    n_samples = len(dataset) 
    y_pred = np.zeros((n_samples, n_tasks))
    for ind, task in enumerate(self.tasks):
      task_type = self.task_types[task]
      if self.store_in_memory:
        task_model = self.task_models[task]
      else:
        task_model = self.model_builder(
            [task], {task: self.task_types[task]}, self.model_params,
            self.task_model_dirs[task],
            verbosity=self.verbosity)
        task_model.reload()

      # Collect untransformed per-task predictions; transforms are undone
      # once on the stacked multitask array below.
      y_pred[:, ind] = task_model.predict(dataset, [])
    y_pred = undo_transforms(y_pred, transformers)
    return y_pred

  def predict_proba_on_batch(self, X, n_classes=2):
@@ -127,6 +148,27 @@ class SingletaskToMultitask(Model):
      y_pred[:, ind] = task_model.predict_proba_on_batch(X)
    return y_pred

  def predict_proba(self, dataset, transformers=[], n_classes=2):
    """
    Concatenates results from all singletask models.
    """
    n_tasks = len(self.tasks)
    n_samples = len(dataset)
    y_pred = np.zeros((n_samples, n_tasks, n_classes))
    for ind, task in enumerate(self.tasks):
      if self.store_in_memory:
        task_model = self.task_models[task]
      else:
        task_model = self.model_builder(
            [task], {task: self.task_types[task]}, self.model_params,
            self.task_model_dirs[task],
            verbosity=self.verbosity)
        task_model.reload()

      y_pred[:, ind] = task_model.predict_proba(
          dataset, transformers, n_classes)
    return y_pred
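The column-stitching pattern used by predict and predict_proba above, in isolation (toy shapes; the random slab stands in for a task model's output):

import numpy as np

n_samples, n_tasks, n_classes = 4, 3, 2
y_pred = np.zeros((n_samples, n_tasks, n_classes))
for ind in range(n_tasks):
    # Each per-task model contributes one (n_samples, n_classes) slab.
    y_pred[:, ind] = np.random.rand(n_samples, n_classes)
print(y_pred.shape)  # (4, 3, 2)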

  def save(self):
    """Save all models"""
    # Saving is done on-the-fly