Commit 294805bb authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #377 from joegomes/gdb7_test2

On-the-fly transformation
parents 8b004c53 cdc3ae5c
Loading
Loading
Loading
Loading
+6 −4
Original line number Diff line number Diff line
@@ -263,14 +263,16 @@ Scaffold splitting
|chembl          |MT-NN regression    |Index       |0.443         |0.427         |
|                |MT-NN regression    |Random      |0.464         |0.434         |
|                |MT-NN regression    |Scaffold    |0.484         |0.361         |
|gdb7            |MT-NN regression    |Index       |0.961         |0.011         |
|                |MT-NN regression    |Random      |0.742         |0.732         |
|gdb7            |MT-NN regression    |Index       |0.994         |0.010         |
|                |MT-NN regression    |Random      |0.860         |0.773         |
|                |MT-NN regression    |User-defined|0.996         |0.996         | 
|kaggle          |MT-NN regression    |User-defined|0.748         |0.452         |

|Dataset         |Model               |Splitting   |Train score/MAE(kcal/mol)|Valid score/MAE(kcal/mol)|
|----------------|--------------------|------------|-------------------------|-------------------------|
|gdb7            |MT-NN regression    |Index       |44.5                     |185.6                    |
|                |MT-NN regression    |Random      |86.1                     |92.2                     |
|gdb7            |MT-NN regression    |Index       |18.3                     |172.0                    |
|                |MT-NN regression    |Random      |44.3                     |59.1                     |
|                |MT-NN regression    |User-defined|9.0                      |9.5                      |

* General features

+29 −5
Original line number Diff line number Diff line
@@ -26,24 +26,37 @@ class CoulombMatrix(Featurizer):
  max_atoms : int
      Maximum number of atoms for any molecule in the dataset. Used to
      pad the Coulomb matrix.
  remove_hydrogens : bool, optional (default True)
  remove_hydrogens : bool, optional (default False)
      Whether to remove hydrogens before constructing Coulomb matrix.
  randomize : bool, optional (default True)
  randomize : bool, optional (default False)
      Whether to randomize Coulomb matrices to remove dependence on atom
      index order.
  upper_tri : bool, optional (default False)
      Whether to return the upper triangular portion of the Coulomb matrix.
  n_samples : int, optional (default 1)
      Number of random Coulomb matrices to generate if randomize is True.
  seed : int, optional
      Random seed.

  Example:

  >>> featurizers = dc.feat.CoulombMatrix(max_atoms=23)
  >>> input_file = "input.sdf"
  >>> tasks = ["task0"]
  >>> featurizer = dc.data.SDFLoader(tasks, smiles_field="smiles", mol_field="mol",
  ...                                featurizer=featurizers)
  >>> dataset = featurizer.featurize(input_file)

  """
  conformers = True
  name = 'coulomb_matrix'

  def __init__(self, max_atoms, remove_hydrogens=True, randomize=True,
               n_samples=1, seed=None):
  def __init__(self, max_atoms, remove_hydrogens=False, randomize=False,
               upper_tri=False, n_samples=1, seed=None):
    self.max_atoms = int(max_atoms)
    self.remove_hydrogens = remove_hydrogens
    self.randomize = randomize
    self.upper_tri = upper_tri
    self.n_samples = n_samples
    if seed is not None:
      seed = int(seed)
@@ -64,6 +77,7 @@ class CoulombMatrix(Featurizer):
        Molecule.
    """
    features = self.coulomb_matrix(mol)
    if self.upper_tri:
      features = [f[np.triu_indices_from(f)] for f in features]
    features = np.asarray(features)
    return features
@@ -174,6 +188,16 @@ class CoulombMatrixEig(CoulombMatrix):
      Number of random Coulomb matrices to generate if randomize is True.
  seed : int, optional
      Random seed.

  Example:

  >>> featurizers = dc.feat.CoulombMatrixEig(max_atoms=23)
  >>> input_file = "input.sdf"
  >>> tasks = ["task0"]
  >>> featurizer = dc.data.SDFLoader(tasks, smiles_field="smiles", mol_field="mol",
  ...                                featurizer=featurizers)
  >>> dataset = featurizer.featurize(input_file)

  """

  conformers = True
+22 −5
Original line number Diff line number Diff line
@@ -30,14 +30,31 @@ class TestCoulombMatrix(unittest.TestCase):
        """
        f = cm.CoulombMatrix(self.mol.GetNumAtoms())
        rval = f([self.mol])
        size = np.triu_indices(self.mol.GetNumAtoms())[0].size
        assert rval.shape == (1, self.mol.GetNumConformers(), size)
        assert rval.shape == (1, self.mol.GetNumConformers(), self.mol.GetNumAtoms(), self.mol.GetNumAtoms())

    def test_coulomb_matrix_padding(self):
        """
        Test CoulombMatrix with padding.

        max_atoms is set to twice the molecule's atom count, and the
        featurized output is expected to hold one (max_atoms, max_atoms)
        matrix per conformer.
        """
        max_atoms = self.mol.GetNumAtoms() * 2
        f = cm.CoulombMatrix(max_atoms=max_atoms)
        rval = f([self.mol])
        assert rval.shape == (1, self.mol.GetNumConformers(), max_atoms, max_atoms)

    def test_upper_tri_coulomb_matrix(self):
        """
        Check the output shape when only the upper triangle is requested.
        """
        n_atoms = self.mol.GetNumAtoms()
        featurizer = cm.CoulombMatrix(n_atoms, upper_tri=True)
        features = featurizer([self.mol])
        # Number of entries in the upper triangle (diagonal included) of an
        # n_atoms x n_atoms matrix.
        n_upper = np.triu_indices(n_atoms)[0].size
        expected_shape = (1, self.mol.GetNumConformers(), n_upper)
        assert features.shape == expected_shape

    def test_upper_tri_coulomb_matrix_padding(self):
        """
        Check the upper-triangular output shape when max_atoms exceeds the
        molecule's atom count (here: twice as many).
        """
        padded_atoms = self.mol.GetNumAtoms() * 2
        featurizer = cm.CoulombMatrix(max_atoms=padded_atoms, upper_tri=True)
        features = featurizer([self.mol])
        # Number of entries in the upper triangle (diagonal included) of a
        # padded_atoms x padded_atoms matrix.
        n_upper = np.triu_indices(padded_atoms)[0].size
        expected_shape = (1, self.mol.GetNumConformers(), n_upper)
        assert features.shape == expected_shape
@@ -49,7 +66,7 @@ class TestCoulombMatrix(unittest.TestCase):
        mol = Chem.RemoveHs(self.mol)
        assert mol.GetNumAtoms() < self.mol.GetNumAtoms()
        f = cm.CoulombMatrix(max_atoms=mol.GetNumAtoms(),
                             remove_hydrogens=True)
                             remove_hydrogens=True, upper_tri=True)
        rval = f([self.mol])  # use the version with hydrogens
        size = np.triu_indices(mol.GetNumAtoms())[0].size
        assert rval.shape == (1, mol.GetNumConformers(), size)
@@ -59,7 +76,7 @@ class TestCoulombMatrix(unittest.TestCase):
        Test no hydrogen removal.
        """
        f = cm.CoulombMatrix(max_atoms=self.mol.GetNumAtoms(),
                             remove_hydrogens=False)
                             remove_hydrogens=False, upper_tri=True)
        rval = f([self.mol])
        size = np.triu_indices(self.mol.GetNumAtoms())[0].size
        assert rval.shape == (1, self.mol.GetNumConformers(), size)
+1 −0
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@ from deepchem.models.multitask import SingletaskToMultitask

from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskRegressor
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifier
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskFitTransformRegressor
from deepchem.models.tensorflow_models.robust_multitask import RobustMultitaskRegressor
from deepchem.models.tensorflow_models.robust_multitask import RobustMultitaskClassifier
from deepchem.models.tensorflow_models.lr import TensorflowLogisticRegression
+234 −0
Original line number Diff line number Diff line
@@ -10,7 +10,9 @@ import tensorflow as tf

from deepchem.nn import model_ops
from deepchem.metrics import from_one_hot
from deepchem.utils.save import log
from deepchem.models.tensorflow_models import TensorflowGraph
from deepchem.models.tensorflow_models import TensorflowGraphModel
from deepchem.models.tensorflow_models import TensorflowClassifier
from deepchem.models.tensorflow_models import TensorflowRegressor
from deepchem.metrics import to_one_hot
@@ -186,3 +188,235 @@ class TensorflowMultiTaskRegressor(TensorflowRegressor):
        orig_dict["weights_%d" % task] = np.ones(
            (self.batch_size,)) 
    return TensorflowGraph.get_feed_dict(orig_dict)

class TensorflowMultiTaskFitTransformRegressor(TensorflowMultiTaskRegressor):
  """Multitask regressor that applies fit transformers on the fly.

  Every transformer in ``fit_transformers`` has its ``X_transform`` method
  applied to each minibatch inside ``fit`` and to the input of
  ``predict_on_batch``. At predict time the input is transformed and
  evaluated ``n_evals`` times and the resulting outputs are averaged.

  Example:

  >>> n_samples = 10
  >>> n_features = 3
  >>> n_tasks = 1
  >>> ids = np.arange(n_samples)
  >>> X = np.random.rand(n_samples, n_features, n_features)
  >>> y = np.zeros((n_samples, n_tasks))
  >>> w = np.ones((n_samples, n_tasks))
  >>> dataset = dc.data.NumpyDataset(X, y, w, ids)
  >>> fit_transformers = [dc.trans.CoulombFitTransformer(dataset)]
  >>> model = dc.models.TensorflowMultiTaskFitTransformRegressor(
  ...     n_tasks, [n_features, n_features], dropouts=[0.],
  ...     learning_rate=0.003, weight_init_stddevs=[np.sqrt(6)/np.sqrt(1000)],
  ...     batch_size=n_samples, fit_transformers=fit_transformers, n_evals=1)

  """

  def __init__(self, n_tasks, n_features, logdir=None, layer_sizes=[1000],
               weight_init_stddevs=[.02], bias_init_consts=[1.], penalty=0.0,
               penalty_type="l2", dropouts=[0.5], learning_rate=0.002,
               momentum=.8, optimizer="adam", batch_size=50,
               fit_transformers=[], n_evals=1, verbose=True, seed=None, **kwargs):

    """Initialize TensorflowMultiTaskFitTransformRegressor

    Parameters
    ----------
    n_tasks: int
      Number of tasks
    n_features: int or list/tuple of int
      Shape of a single untransformed example: an int for flat feature
      vectors, or a sequence of ints for multi-dimensional features. The
      number of features handed to the underlying graph model is derived
      from this by running the fit transformers on a dummy batch.
    logdir: str
      Location to save data
    layer_sizes: list
      List of layer sizes.
    weight_init_stddevs: list
      List of standard deviations for weights (sampled from zero-mean
      gaussians). One for each layer.
    bias_init_consts: list
      List of bias initializations. One for each layer.
    penalty: float
      Amount of penalty (l2 or l1 applied)
    penalty_type: str
      Either "l2" or "l1"
    dropouts: list
      List of dropout amounts. One for each layer.
    learning_rate: float
      Learning rate for model.
    momentum: float
      Momentum. Only applied if optimizer=="momentum"
    optimizer: str
      Type of optimizer applied.
    batch_size: int
      Size of minibatches for training.
    fit_transformers: list
      List of dc.trans.FitTransformer objects
    n_evals: int
      Number of evaluations per example at predict time
    verbose: bool
      Perform logging.
    seed: int
      If not none, is used as random seed for tensorflow.

    Raises
    ------
    ValueError
      If n_features is neither an int nor a list/tuple.
    """

    # Copy so that the shared default list (or a caller-owned list) is never
    # aliased by the instance attribute.
    self.fit_transformers = list(fit_transformers)
    self.n_evals = n_evals

    # Run fit transformers on dummy dataset to determine n_features after transformation
    if isinstance(n_features, (list, tuple)):
      X_b = np.ones([batch_size] + list(n_features))
    elif isinstance(n_features, int):
      X_b = np.ones([batch_size, n_features])
    else:
      raise ValueError("n_features should be list or int")

    for transformer in self.fit_transformers:
      X_b = transformer.X_transform(X_b)
    n_features = X_b.shape[1]
    log("n_features after fit_transform: %d" % int(n_features), verbose)

    # TensorflowGraphModel.__init__ is invoked directly (not the immediate
    # parent's __init__), with batch padding disabled.
    TensorflowGraphModel.__init__(
        self, n_tasks, n_features, logdir=logdir,
        layer_sizes=layer_sizes, weight_init_stddevs=weight_init_stddevs,
        bias_init_consts=bias_init_consts, penalty=penalty,
        penalty_type=penalty_type, dropouts=dropouts,
        learning_rate=learning_rate, momentum=momentum, optimizer=optimizer,
        batch_size=batch_size, pad_batches=False, verbose=verbose, seed=seed,
        **kwargs)

  def fit(self, dataset, nb_epoch=10, max_checkpoints_to_keep=5, log_every_N_batches=50, **kwargs):
    """Perform fit transformations on each minibatch. Fit the model.

    A checkpoint is saved before training, after every epoch, and once more
    when training completes.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset object holding training data
    nb_epoch: int
      Number of training epochs.
    max_checkpoints_to_keep: int
      Maximum number of checkpoints to keep; older checkpoints will be deleted.
    log_every_N_batches: int
      Report every N batches. Useful for training on very large datasets,
      where epochs can take long time to finish.
    """
    # Local import: the file's top-level import block is not guaranteed to
    # provide `time`.
    import time

    start_time = time.time()
    log("Training for %d epochs" % nb_epoch, self.verbose)
    with self.train_graph.graph.as_default():
      train_op = self.get_training_op(
          self.train_graph.graph, self.train_graph.loss)
      with self._get_shared_session(train=True) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)
        # Save an initial checkpoint.
        saver.save(sess, self._save_path, global_step=0)
        for epoch in range(nb_epoch):
          total_loss, n_batches = 0., 0
          for ind, (X_b, y_b, w_b, ids_b) in enumerate(
              dataset.iterbatches(self.batch_size, pad_batches=self.pad_batches)):
            if ind % log_every_N_batches == 0:
              log("On batch %d" % ind, self.verbose)
            # On-the-fly transformation of this minibatch's features.
            for transformer in self.fit_transformers:
              X_b = transformer.X_transform(X_b)
            # Run training op.
            feed_dict = self.construct_feed_dict(X_b, y_b, w_b, ids_b)
            fetches = self.train_graph.output + [
                train_op, self.train_graph.loss]
            fetched_values = sess.run(fetches, feed_dict=feed_dict)
            # The loss is the last fetch; the model outputs are not needed.
            total_loss += fetched_values[-1]
            n_batches += 1
          saver.save(sess, self._save_path, global_step=epoch)
          # Guard against an empty dataset (no batches yielded).
          avg_loss = float(total_loss) / n_batches if n_batches else 0.
          log('Ending epoch %d: Average loss %g' % (epoch, avg_loss), self.verbose)
        # Always save a final checkpoint when complete. nb_epoch equals the
        # last epoch index + 1 and is also defined when no epoch was run.
        saver.save(sess, self._save_path, global_step=nb_epoch)
    log("TIMING: model fitting took %0.3f s" % (time.time() - start_time),
        self.verbose)

  def predict_on_batch(self, X):
    """Return model output for the provided input. Each example is evaluated
        self.n_evals times.

    The fit transformers are applied to X once per evaluation; the model
    outputs of all evaluations are averaged. If the model has not been
    restored yet, restore() is called first.

    Args:
      X: Untransformed batch of features.

    Returns:
      numpy array holding the outputs averaged over the n_evals evaluations
      (squeezed, and truncated to the number of transformed examples). A
      0-dimensional result is returned as an array of shape (1,).

    Raises:
      ValueError: If n_evals is smaller than 1, or if the model output has an
        unrecognized rank.
    """
    if self.n_evals < 1:
      raise ValueError("n_evals must be at least 1, got %r" % (self.n_evals,))

    # Transform the input independently for each evaluation.
    X_evals = []
    for _ in range(self.n_evals):
      X_t = X
      for transformer in self.fit_transformers:
        X_t = transformer.X_transform(X_t)
      X_evals.append(X_t)
    len_unpadded = len(X_evals[-1])
    if self.pad_batches:
      # NOTE(review): pad_features is not imported in the visible part of
      # this file - confirm it is in scope. __init__ sets pad_batches=False.
      X_evals = [pad_features(self.batch_size, X_e) for X_e in X_evals]
    if not self._restored_model:
      self.restore()
    with self.eval_graph.graph.as_default():

      # run eval data through the model
      n_tasks = self.n_tasks
      outputs = []
      with self._get_shared_session(train=False).as_default():

        n_samples = len(X_evals[0])
        for X_eval in X_evals:
          feed_dict = self.construct_feed_dict(X_eval)
          data = self._get_shared_session(train=False).run(
              self.eval_graph.output, feed_dict=feed_dict)
          batch_outputs = np.asarray(data[:n_tasks], dtype=float)
          # reshape to batch_size x n_tasks x ...
          if batch_outputs.ndim == 3:
            batch_outputs = batch_outputs.transpose((1, 0, 2))
          elif batch_outputs.ndim == 2:
            batch_outputs = batch_outputs.transpose((1, 0))
          # Handle edge case when batch-size is 1.
          elif batch_outputs.ndim == 1:
            n_samples = len(X)
            batch_outputs = batch_outputs.reshape((n_samples, n_tasks))
          else:
            raise ValueError(
                'Unrecognized rank combination for output: %s' %
                (batch_outputs.shape,))
          # Prune away any padding that was added
          batch_outputs = batch_outputs[:n_samples]
          outputs.append(np.squeeze(batch_outputs))

    # Average over the n_evals evaluations.
    outputs = np.mean(np.array(outputs), axis=0)

    # Handle case of 0-dimensional scalar output
    if len(outputs.shape) > 0:
      return outputs[:len_unpadded]
    else:
      outputs = np.reshape(outputs, (1,))
      return outputs
Loading