Unverified Commit 766139be authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #949 from peastman/examples

Cleanup of examples plus bug fixes
parents f4fb1f00 c7315e7a
Loading
Loading
Loading
Loading
+7 −8
Original line number Diff line number Diff line
@@ -22,7 +22,7 @@ from deepchem.models.tensorflow_models import TensorflowRegressor
from deepchem.metrics import to_one_hot

from deepchem.models.tensorgraph.tensor_graph import TensorGraph, TFWrapper
from deepchem.models.tensorgraph.layers import Feature, Label, Weights, WeightedError, Dense, Dropout, WeightDecay, Reshape, SoftMaxCrossEntropy, L2Loss
from deepchem.models.tensorgraph.layers import Feature, Label, Weights, WeightedError, Dense, Dropout, WeightDecay, Reshape, SoftMaxCrossEntropy, L2Loss, ReduceSum


class TensorGraphMultiTaskClassifier(TensorGraph):
@@ -280,8 +280,7 @@ class TensorGraphMultiTaskRegressor(TensorGraph):
    self.add_output(output)
    labels = Label(shape=(None, n_tasks, 1))
    weights = Weights(shape=(None, n_tasks))
    loss = L2Loss(in_layers=[labels, output])
    weighted_loss = WeightedError(in_layers=[loss, weights])
    weighted_loss = ReduceSum(L2Loss(in_layers=[labels, output, weights]))
    if weight_decay_penalty != 0.0:
      weighted_loss = WeightDecay(
          weight_decay_penalty,
@@ -500,8 +499,8 @@ class TensorflowMultiTaskClassifier(TensorflowClassifier):
        orig_dict["labels_%d" % task] = to_one_hot(y_b[:, task])
      else:
        # Dummy placeholders
        orig_dict["labels_%d" %
                  task] = np.squeeze(to_one_hot(np.zeros((self.batch_size,))))
        orig_dict["labels_%d" % task] = np.squeeze(
            to_one_hot(np.zeros((self.batch_size,))))
      if w_b is not None:
        orig_dict["weights_%d" % task] = w_b[:, task]
      else:
@@ -588,8 +587,8 @@ class TensorflowMultiTaskRegressor(TensorflowRegressor):
                    weight_init=tf.truncated_normal(
                        shape=[prev_layer_size, 1],
                        stddev=weight_init_stddevs[i]),
                    bias_init=tf.constant(value=bias_init_consts[i], shape=[1
                                                                           ]))))
                    bias_init=tf.constant(value=bias_init_consts[i],
                                          shape=[1]))))
    return (output, labels, weights)

  def construct_feed_dict(self, X_b, y_b=None, w_b=None, ids_b=None):
+43 −29
Original line number Diff line number Diff line
@@ -425,8 +425,8 @@ class Conv1D(Layer):
      raise ValueError("Parent tensor must be (batch, width, channel)")
    parent_shape = parent.get_shape()
    parent_channel_size = parent_shape[2].value
    f = tf.Variable(self.weights_initializer()
                    ([self.width, parent_channel_size, self.out_channels]))
    f = tf.Variable(self.weights_initializer()(
        [self.width, parent_channel_size, self.out_channels]))
    t = tf.nn.conv1d(parent, f, stride=self.stride, padding=self.padding)
    if self.biases_initializer is not None:
      b = tf.Variable(self.biases_initializer()([self.out_channels]))
@@ -947,6 +947,12 @@ class Weights(Input):


class L1Loss(Layer):
  """Compute the mean absolute difference between the elements of the inputs.

  This layer should have two or three inputs.  If there is a third input, the
  absolute difference between the first two inputs is multiplied by the third
  one to produce a weighted error.
  """

  def __init__(self, in_layers=None, **kwargs):
    super(L1Loss, self).__init__(in_layers, **kwargs)
@@ -954,14 +960,22 @@ class L1Loss(Layer):
  def create_tensor(self, in_layers=None, set_tensors=True, **kwargs):
    inputs = self._get_input_tensors(in_layers, True)
    guess, label = inputs[0], inputs[1]
    out_tensor = tf.reduce_mean(
        tf.abs(guess - label), axis=list(range(1, len(label.shape))))
    l1 = tf.abs(guess - label)
    if len(inputs) > 2:
      l1 *= inputs[2]
    out_tensor = tf.reduce_mean(l1, axis=list(range(1, len(label.shape))))
    if set_tensors:
      self.out_tensor = out_tensor
    return out_tensor


class L2Loss(Layer):
  """Compute the mean squared difference between the elements of the inputs.

  This layer should have two or three inputs.  If there is a third input, the
  squared difference between the first two inputs is multiplied by the third one to
  produce a weighted error.
  """

  def __init__(self, in_layers=None, **kwargs):
    super(L2Loss, self).__init__(in_layers, **kwargs)
@@ -969,17 +983,19 @@ class L2Loss(Layer):
      shape1 = self.in_layers[0].shape
      shape2 = self.in_layers[1].shape
      if shape1[0] is None:
        self._shape = (parent_shape[1],)
        self._shape = (shape2[0],)
      else:
        self._shape = (parent_shape[0],)
        self._shape = (shape1[0],)
    except:
      pass

  def create_tensor(self, in_layers=None, set_tensors=True, **kwargs):
    inputs = self._get_input_tensors(in_layers, True)
    guess, label = inputs[0], inputs[1]
    out_tensor = tf.reduce_mean(
        tf.square(guess - label), axis=list(range(1, len(label._shape))))
    l2 = tf.square(guess - label)
    if len(inputs) > 2:
      l2 *= inputs[2]
    out_tensor = tf.reduce_mean(l2, axis=list(range(1, len(label._shape))))
    if set_tensors:
      self.out_tensor = out_tensor
    return out_tensor
@@ -1315,7 +1331,7 @@ class SparseSoftMaxCrossEntropy(Layer):
  def __init__(self, in_layers=None, **kwargs):
    super(SparseSoftMaxCrossEntropy, self).__init__(in_layers, **kwargs)
    try:
      self._shape = (self.in_layers[1].shape[0], 1)
      self._shape = self.in_layers[1].shape[:-1]
    except:
      pass

@@ -1324,9 +1340,8 @@ class SparseSoftMaxCrossEntropy(Layer):
    if len(inputs) != 2:
      raise ValueError()
    labels, logits = inputs[0], inputs[1]
    self.out_tensor = tf.nn.sparse_softmax_cross_entropy_with_logits(
    out_tensor = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels)
    out_tensor = tf.reshape(self.out_tensor, [-1, 1])
    if set_tensors:
      self.out_tensor = out_tensor
    return out_tensor
@@ -1337,7 +1352,7 @@ class SoftMaxCrossEntropy(Layer):
  def __init__(self, in_layers=None, **kwargs):
    super(SoftMaxCrossEntropy, self).__init__(in_layers, **kwargs)
    try:
      self._shape = (self.in_layers[1].shape[0], 1)
      self._shape = self.in_layers[1].shape[:-1]
    except:
      pass

@@ -1346,9 +1361,8 @@ class SoftMaxCrossEntropy(Layer):
    if len(inputs) != 2:
      raise ValueError()
    labels, logits = inputs[0], inputs[1]
    self.out_tensor = tf.nn.softmax_cross_entropy_with_logits(
    out_tensor = tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=labels)
    out_tensor = tf.reshape(self.out_tensor, [-1, 1])
    if set_tensors:
      self.out_tensor = out_tensor
    return out_tensor
@@ -1887,8 +1901,8 @@ class MaxPool1D(Layer):
    super(MaxPool1D, self).__init__(**kwargs)
    try:
      parent_shape = self.in_layers[0].shape
      self._shape = tuple(None if p is None else p // s
                          for p, s in zip(parent_shape, strides))
      self._shape = tuple(
          None if p is None else p // s for p, s in zip(parent_shape, strides))
    except:
      pass

@@ -1919,8 +1933,8 @@ class MaxPool2D(Layer):
    super(MaxPool2D, self).__init__(**kwargs)
    try:
      parent_shape = self.in_layers[0].shape
      self._shape = tuple(None if p is None else p // s
                          for p, s in zip(parent_shape, strides))
      self._shape = tuple(
          None if p is None else p // s for p, s in zip(parent_shape, strides))
    except:
      pass

@@ -1966,8 +1980,8 @@ class MaxPool3D(Layer):
    super(MaxPool3D, self).__init__(**kwargs)
    try:
      parent_shape = self.in_layers[0].shape
      self._shape = tuple(None if p is None else p // s
                          for p, s in zip(parent_shape, strides))
      self._shape = tuple(
          None if p is None else p // s for p, s in zip(parent_shape, strides))
    except:
      pass

@@ -2679,7 +2693,7 @@ class WeightedError(Layer):
    self._shape = tuple()

  def create_tensor(self, in_layers=None, set_tensors=True, **kwargs):
    inputs = self._get_input_tensors(in_layers, True)
    inputs = self._get_input_tensors(in_layers)
    entropy, weights = inputs[0], inputs[1]
    out_tensor = tf.reduce_sum(entropy * weights)
    if set_tensors:
@@ -3118,8 +3132,8 @@ class NeighborList(Layer):
    mesh_args = [tf.range(start, stop, nbr_cutoff) for _ in range(self.ndim)]
    return tf.to_float(
        tf.reshape(
            tf.transpose(tf.stack(tf.meshgrid(*mesh_args))), (self.n_cells,
                                                              self.ndim)))
            tf.transpose(tf.stack(tf.meshgrid(*mesh_args))),
            (self.n_cells, self.ndim)))


class Dropout(Layer):
@@ -3406,8 +3420,8 @@ class AtomicConvolution(Layer):
    example_tensors = tf.unstack(X, axis=0)
    example_nbrs = tf.unstack(nbr_indices, axis=0)
    all_nbr_coords = []
    for example, (example_tensor,
                  example_nbr) in enumerate(zip(example_tensors, example_nbrs)):
    for example, (example_tensor, example_nbr) in enumerate(
        zip(example_tensors, example_nbrs)):
      nbr_coords = tf.gather(example_tensor, example_nbr)
      all_nbr_coords.append(nbr_coords)
    neighbors = tf.stack(all_nbr_coords)
@@ -3973,13 +3987,13 @@ class GraphCNN(Layer):
    no_features = V.get_shape()[2].value
    W = tf.get_variable(
        '%s_weights' % self.name, [no_features * no_A, self.num_filters],
        initializer=tf.truncated_normal_initializer(stddev=math.sqrt(
            1.0 / (no_features * (no_A + 1) * 1.0))),
        initializer=tf.truncated_normal_initializer(
            stddev=math.sqrt(1.0 / (no_features * (no_A + 1) * 1.0))),
        dtype=tf.float32)
    W_I = tf.get_variable(
        '%s_weights_I' % self.name, [no_features, self.num_filters],
        initializer=tf.truncated_normal_initializer(stddev=math.sqrt(
            1.0 / (no_features * (no_A + 1) * 1.0))),
        initializer=tf.truncated_normal_initializer(
            stddev=math.sqrt(1.0 / (no_features * (no_A + 1) * 1.0))),
        dtype=tf.float32)

    b = tf.get_variable(
+1 −1
Original line number Diff line number Diff line
@@ -318,7 +318,7 @@ class TestLayers(test_util.TensorFlowTestCase):
      label_tensor = tf.convert_to_tensor(label_tensor, dtype=tf.float32)
      out_tensor = SoftMaxCrossEntropy()(logit_tensor, label_tensor)
      out_tensor = out_tensor.eval()
      assert out_tensor.shape == (batch_size, 1)
      assert out_tensor.shape == (batch_size,)

  def test_reduce_mean(self):
    """Test that ReduceMean can be invoked."""
+8 −6
Original line number Diff line number Diff line
@@ -60,14 +60,14 @@ def gen_kaggle(KAGGLE_tasks,
                            "KAGGLE_test2_disguised_combined_full.csv.gz")
  if not os.path.exists(train_files):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/KAGGLE_training_disguised_combined_full.csv.gz'
    )
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/KAGGLE_training_disguised_combined_full.csv.gz',
        dest_dir=data_dir)
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/KAGGLE_test1_disguised_combined_full.csv.gz'
    )
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/KAGGLE_test1_disguised_combined_full.csv.gz',
        dest_dir=data_dir)
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/KAGGLE_test2_disguised_combined_full.csv.gz'
    )
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/KAGGLE_test2_disguised_combined_full.csv.gz',
        dest_dir=data_dir)

  # Featurize KAGGLE dataset
  print("About to featurize KAGGLE dataset.")
@@ -125,6 +125,8 @@ def load_kaggle(shard_size=2000, featurizer=None, split=None, reload=True):
  data_dir = deepchem.utils.get_data_dir()

  data_dir = os.path.join(data_dir, "kaggle")
  if not os.path.exists(data_dir):
    os.mkdir(data_dir)
  train_dir = os.path.join(data_dir, "train_dir")
  valid_dir = os.path.join(data_dir, "valid_dir")
  test_dir = os.path.join(data_dir, "test_dir")
+0 −89
Original line number Diff line number Diff line
"""
ChEMBL dataset loader.
"""
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import sys
import time

import deepchem as dc

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from chembl_tasks import chembl_tasks


# Set shard size low to avoid memory problems.
def load_chembl(shard_size=2000, featurizer="ECFP", set="5thresh", split="random"):
    """Load, featurize, normalize and split a ChEMBL dataset.

    Parameters
    ----------
    shard_size: int
        Passed through to ``loader.featurize`` as ``shard_size``.
    featurizer: str or object
        ``'ECFP'`` selects ``dc.feat.CircularFingerprint(size=1024)`` and
        ``'GraphConv'`` selects ``dc.feat.ConvMolFeaturizer()``.  Any other
        value is handed to ``dc.data.CSVLoader`` unchanged.
    set: str
        Name interpolated into the CSV file names (``chembl_<set>...``).
        The name shadows the builtin but is kept for backward compatibility.
    split: str
        ``'year'`` reads the pre-split train/valid/test files from
        ``year_sets/``; ``'index'``, ``'random'`` and ``'scaffold'`` read the
        single combined file and split it with the matching ``dc.splits``
        splitter.

    Returns
    -------
    tuple
        ``(chembl_tasks, (train, valid, test), transformers)``.

    Raises
    ------
    ValueError
        If ``split`` is not one of the supported names.
    """
    # Maps each supported split name to the dc.splits class implementing it.
    splitter_class_names = {
        'index': 'IndexSplitter',
        'random': 'RandomSplitter',
        'scaffold': 'ScaffoldSplitter',
    }
    # Fail fast: previously an unknown split was only noticed at the return
    # statement (as an unbound-local error), after featurizing everything.
    if split != "year" and split not in splitter_class_names:
        raise ValueError(
            "Unknown split %r; expected 'year' or one of %s" %
            (split, sorted(splitter_class_names)))

    current_dir = os.path.dirname(os.path.realpath(__file__))

    # Load dataset
    print("About to load ChEMBL dataset.")
    if split == "year":
        train_files = os.path.join(current_dir,
                                   "year_sets/chembl_%s_ts_train.csv.gz" % set)
        valid_files = os.path.join(current_dir,
                                   "year_sets/chembl_%s_ts_valid.csv.gz" % set)
        test_files = os.path.join(current_dir,
                                  "year_sets/chembl_%s_ts_test.csv.gz" % set)
    else:
        dataset_path = os.path.join(
            current_dir, "../../datasets/chembl_%s.csv.gz" % set)

    # Featurize ChEMBL dataset
    print("About to featurize ChEMBL dataset.")
    if featurizer == 'ECFP':
        featurizer = dc.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
        featurizer = dc.feat.ConvMolFeaturizer()

    loader = dc.data.CSVLoader(
        tasks=chembl_tasks, smiles_field="smiles", featurizer=featurizer)

    print_transform_banner = "About to transform data"

    if split == "year":
        print("Featurizing train datasets")
        train_dataset = loader.featurize(train_files, shard_size=shard_size)

        print("Featurizing valid datasets")
        valid_dataset = loader.featurize(valid_files, shard_size=shard_size)

        print("Featurizing test datasets")
        test_dataset = loader.featurize(test_files, shard_size=shard_size)

        # Initialize transformers (fitted on the training data only).
        print(print_transform_banner)
        transformers = [
            dc.trans.NormalizationTransformer(
                transform_y=True, dataset=train_dataset)
        ]
        # Chain the transformers so each one sees the previous one's output
        # instead of restarting from the untransformed datasets.
        train, valid, test = train_dataset, valid_dataset, test_dataset
        for transformer in transformers:
            train = transformer.transform(train)
            valid = transformer.transform(valid)
            test = transformer.transform(test)
    else:
        dataset = loader.featurize(dataset_path, shard_size=shard_size)

        # Initialize transformers
        print(print_transform_banner)
        transformers = [
            dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)
        ]
        for transformer in transformers:
            dataset = transformer.transform(dataset)

        splitter = getattr(dc.splits, splitter_class_names[split])()
        print("Performing new split.")
        train, valid, test = splitter.train_valid_test_split(dataset)

    return chembl_tasks, (train, valid, test), transformers
Loading