Commit de39f971 authored by miaecle's avatar miaecle
Browse files

Merge remote-tracking branch 'remotes/origin/master'

parents 61a456ad 63610f09
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -32,3 +32,4 @@ from deepchem.models.tensorgraph.models.graph_models import WeaveTensorGraph, DT
from deepchem.models.tensorgraph.models.symmetry_function_regression import BPSymmetryFunctionRegression, ANIRegression

from deepchem.models.tensorgraph.models.seqtoseq import SeqToSeq
from deepchem.models.tensorgraph.models.text_cnn import TextCNNTensorGraph
+106 −0
Original line number Diff line number Diff line
@@ -480,6 +480,67 @@ class Dense(Layer):
      return self._shared_with._get_scope_name()


class Highway(Layer):
  """ Create a highway layer. y = H(x) * T(x) + x * (1 - T(x))
  H(x) = activation_fn(matmul(W_H, x) + b_H) is the non-linear transformed output
  T(x) = sigmoid(matmul(W_T, x) + b_T) is the transform gate

  reference: https://arxiv.org/pdf/1505.00387.pdf

  This layer expects its input to be a two dimensional tensor of shape (batch size, # input features).
  Outputs will be in the same shape.
  """

  def __init__(
      self,
      activation_fn=tf.nn.relu,
      biases_initializer=tf.zeros_initializer,
      weights_initializer=tf.contrib.layers.variance_scaling_initializer,
      **kwargs):
    """

    Parameters
    ----------
    activation_fn: object
      the Tensorflow activation function to apply to the output
    biases_initializer: callable object
      the initializer for bias values.  This may be None, in which case the layer
      will not include biases.
    weights_initializer: callable object
      the initializer for weight values
    """
    super(Highway, self).__init__(**kwargs)
    self.activation_fn = activation_fn
    self.biases_initializer = biases_initializer
    self.weights_initializer = weights_initializer

  def create_tensor(self, in_layers=None, set_tensors=True, **kwargs):
    inputs = self._get_input_tensors(in_layers)
    parent = inputs[0]
    shape = parent.get_shape().as_list()[1]
    # H(x), with same number of input and output channels
    dense_H = tf.contrib.layers.fully_connected(
        parent,
        num_outputs=shape,
        activation_fn=self.activation_fn,
        biases_initializer=self.biases_initializer(),
        weights_initializer=self.weights_initializer(),
        trainable=True)
    # T(x), with same number of input and output channels
    dense_T = tf.contrib.layers.fully_connected(
        parent,
        num_outputs=shape,
        activation_fn=tf.nn.sigmoid,
        biases_initializer=tf.constant_initializer(-1),
        weights_initializer=self.weights_initializer(),
        trainable=True)
    out_tensor = tf.multiply(dense_H, dense_T) + tf.multiply(
        parent, 1 - dense_T)
    if set_tensors:
      self.out_tensor = out_tensor
    return out_tensor


class Flatten(Layer):
  """Flatten every dimension except the first"""

@@ -1516,6 +1577,51 @@ class Conv3D(Layer):
    return out_tensor


class MaxPool1D(Layer):
  """A 1D max pooling on the input.

  This layer expects its input to be a three dimensional tensor of shape
  (batch size, width, # channels).
  """

  def __init__(self, window_shape=2, strides=1, padding="SAME", **kwargs):
    """Create a MaxPool1D layer.

    Parameters
    ----------
    window_shape: int, optional
      size of the window(assuming input with only one dimension)
    strides: int, optional
      stride of the sliding window
    padding: str
      the padding method to use, either 'SAME' or 'VALID'
    """
    self.window_shape = window_shape
    self.strides = strides
    self.padding = padding
    self.pooling_type = "MAX"
    super(MaxPool1D, self).__init__(**kwargs)
    try:
      parent_shape = self.in_layers[0].shape
      self._shape = tuple(None if p is None else p // s
                          for p, s in zip(parent_shape, strides))
    except:
      pass

  def create_tensor(self, in_layers=None, set_tensors=True, **kwargs):
    inputs = self._get_input_tensors(in_layers)
    in_tensor = inputs[0]
    out_tensor = tf.nn.pool(
        in_tensor,
        window_shape=[self.window_shape],
        pooling_type=self.pooling_type,
        padding=self.padding,
        strides=[self.strides])
    if set_tensors:
      self.out_tensor = out_tensor
    return out_tensor


class MaxPool2D(Layer):

  def __init__(self,
+274 −0
Original line number Diff line number Diff line
"""
Created on Thu Sep 28 15:17:50 2017

@author: zqwu
"""
import numpy as np
import tensorflow as tf
import copy

from deepchem.metrics import to_one_hot, from_one_hot
from deepchem.models.tensorgraph.layers import Dense, Concat, SoftMax, \
  SoftMaxCrossEntropy, BatchNorm, WeightedError, Dropout, BatchNormalization, \
  Conv1D, MaxPool1D, Squeeze, Stack, Highway
from deepchem.models.tensorgraph.graph_layers import DTNNEmbedding

from deepchem.models.tensorgraph.layers import L2Loss, Label, Weights, Feature
from deepchem.models.tensorgraph.tensor_graph import TensorGraph
from deepchem.trans import undo_transforms

# Common symbols in SMILES, note that Cl and Br are regarded as single symbol
default_dict = {
    '#': 1,
    '(': 2,
    ')': 3,
    '+': 4,
    '-': 5,
    '/': 6,
    '1': 7,
    '2': 8,
    '3': 9,
    '4': 10,
    '5': 11,
    '6': 12,
    '7': 13,
    '8': 14,
    '=': 15,
    'C': 16,
    'F': 17,
    'H': 18,
    'I': 19,
    'N': 20,
    'O': 21,
    'P': 22,
    'S': 23,
    '[': 24,
    '\\': 25,
    ']': 26,
    '_': 27,
    'c': 28,
    'Cl': 29,
    'Br': 30,
    'n': 31,
    'o': 32,
    's': 33
}


class TextCNNTensorGraph(TensorGraph):
  """ A Convolutional neural network on smiles strings
  Reimplementation of the discriminator module in ORGAN: https://arxiv.org/abs/1705.10843
  Originated from: http://emnlp2014.org/papers/pdf/EMNLP2014181.pdf

  This model applies multiple 1D convolutional filters to the padded strings,
  then max-over-time pooling is applied on all filters, extracting one feature per filter.
  All features are concatenated and transformed through several hidden layers to form predictions.

  This model is initially developed for sentence-level classification tasks, with
  words represented as vectors. In this implementation, SMILES strings are dissected
  into characters and transformed to one-hot vectors in a similar way. The model can
  be used for general molecular-level classification or regression tasks. It is also
  used in the ORGAN model as discriminator.
  
  Training of the model only requires SMILES strings input, all featurized datasets
  that include SMILES in the `ids` attribute are accepted. PDBbind, QM7 and QM7b 
  are not supported. To use the model, `build_char_dict` should be called first
  before defining the model to build character dict of input dataset, example can
  be found in examples/delaney/delaney_textcnn.py
  
  """

  def __init__(
      self,
      n_tasks,
      char_dict,
      seq_length,
      n_embedding=75,
      filter_sizes=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20],
      num_filters=[100, 200, 200, 200, 200, 100, 100, 100, 100, 100, 160, 160],
      dropout=0.25,
      mode="classification",
      **kwargs):
    """
    Parameters
    ----------
    n_tasks: int
      Number of tasks
    char_dict: dict
      Mapping from characters in smiles to integers
    seq_length: int
      Length of sequences(after padding)
    n_embedding: int, optional
      Length of embedding vector
    filter_sizes: list of int, optional
      Properties of filters used in the conv net
    num_filters: list of int, optional
      Properties of filters used in the conv net
    mode: str
      Either "classification" or "regression" for type of model.
    """
    self.n_tasks = n_tasks
    self.char_dict = char_dict
    self.seq_length = seq_length
    self.n_embedding = n_embedding
    self.filter_sizes = filter_sizes
    self.num_filters = num_filters
    self.dropout = dropout
    self.mode = mode
    super(TextCNNTensorGraph, self).__init__(**kwargs)
    self.build_graph()

  @staticmethod
  def build_char_dict(dataset, default_dict=default_dict):
    """ Collect all unique characters(in smiles) from the dataset.
    This method should be called before defining the model to build appropriate char_dict
    """
    # SMILES strings
    X = dataset.ids
    # Maximum length is expanded to allow length variation during train and inference
    seq_length = int(max([len(smile) for smile in X]) * 1.2)
    # '_' served as delimiter and padding
    all_smiles = '_'.join(X)
    tot_len = len(all_smiles)
    # Initialize common characters as keys
    keys = list(default_dict.keys())
    out_dict = copy.deepcopy(default_dict)
    current_key_val = len(keys) + 1
    # Include space to avoid extra keys
    keys.extend([' '])
    extra_keys = []
    i = 0
    while i < tot_len:
      # For 'Cl', 'Br', etc.
      if all_smiles[i:i + 2] in keys:
        i = i + 2
      elif all_smiles[i:i + 1] in keys:
        i = i + 1
      else:
        # Character not recognized, add to extra_keys
        extra_keys.append(all_smiles[i])
        keys.append(all_smiles[i])
        i = i + 1
    # Add all extra_keys to char_dict
    for extra_key in extra_keys:
      out_dict[extra_key] = current_key_val
      current_key_val += 1
    return out_dict, seq_length

  def build_graph(self):
    self.smiles_seqs = Feature(shape=(None, self.seq_length), dtype=tf.int32)
    # Character embedding
    self.Embedding = DTNNEmbedding(
        n_embedding=self.n_embedding,
        periodic_table_length=len(self.char_dict.keys()) + 1,
        in_layers=[self.smiles_seqs])
    self.pooled_outputs = []
    self.conv_layers = []
    for filter_size, num_filter in zip(self.filter_sizes, self.num_filters):
      # Multiple convolutional layers with different filter widths
      self.conv_layers.append(
          Conv1D(
              filter_size,
              num_filter,
              padding='VALID',
              in_layers=[self.Embedding]))
      # Max-over-time pooling
      self.pooled_outputs.append(
          MaxPool1D(
              window_shape=self.seq_length - filter_size + 1,
              strides=1,
              padding='VALID',
              in_layers=[self.conv_layers[-1]]))
    # Concat features from all filters(one feature per filter)
    concat_outputs = Concat(axis=2, in_layers=self.pooled_outputs)
    outputs = Squeeze(squeeze_dims=1, in_layers=concat_outputs)
    dropout = Dropout(dropout_prob=self.dropout, in_layers=[outputs])
    dense = Dense(
        out_channels=200, activation_fn=tf.nn.relu, in_layers=[dropout])
    # Highway layer from https://arxiv.org/pdf/1505.00387.pdf
    self.gather = Highway(in_layers=[dense])

    costs = []
    self.labels_fd = []
    for task in range(self.n_tasks):
      if self.mode == "classification":
        classification = Dense(
            out_channels=2, activation_fn=None, in_layers=[self.gather])
        softmax = SoftMax(in_layers=[classification])
        self.add_output(softmax)

        label = Label(shape=(None, 2))
        self.labels_fd.append(label)
        cost = SoftMaxCrossEntropy(in_layers=[label, classification])
        costs.append(cost)
      if self.mode == "regression":
        regression = Dense(
            out_channels=1, activation_fn=None, in_layers=[self.gather])
        self.add_output(regression)

        label = Label(shape=(None, 1))
        self.labels_fd.append(label)
        cost = L2Loss(in_layers=[label, regression])
        costs.append(cost)
    if self.mode == "classification":
      all_cost = Concat(in_layers=costs, axis=1)
    elif self.mode == "regression":
      all_cost = Stack(in_layers=costs, axis=1)
    self.weights = Weights(shape=(None, self.n_tasks))
    loss = WeightedError(in_layers=[all_cost, self.weights])
    self.set_loss(loss)

  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        deterministic=True,
                        pad_batches=True):
    """ Transfer smiles strings to fixed length integer vectors
    """
    for epoch in range(epochs):
      if not predict:
        print('Starting epoch %i' % epoch)
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches):

        feed_dict = dict()
        if y_b is not None and not predict:
          for index, label in enumerate(self.labels_fd):
            if self.mode == "classification":
              feed_dict[label] = to_one_hot(y_b[:, index])
            if self.mode == "regression":
              feed_dict[label] = y_b[:, index:index + 1]
        if w_b is not None:
          feed_dict[self.weights] = w_b
        # Transform SMILES string to integer vectors
        smiles_seqs = [self.smiles_to_seq(smiles) for smiles in ids_b]
        feed_dict[self.smiles_seqs] = np.stack(smiles_seqs, axis=0)
        yield feed_dict

  def smiles_to_seq(self, smiles):
    """ Tokenize characters in smiles to integers
    """
    smiles_len = len(smiles)
    seq = [0]
    keys = self.char_dict.keys()
    i = 0
    while i < smiles_len:
      # Skip all spaces
      if smiles[i:i + 1] == ' ':
        i = i + 1
      # For 'Cl', 'Br', etc.
      elif smiles[i:i + 2] in keys:
        seq.append(self.char_dict[smiles[i:i + 2]])
        i = i + 2
      elif smiles[i:i + 1] in keys:
        seq.append(self.char_dict[smiles[i:i + 1]])
        i = i + 1
      else:
        raise ValueError('character not found in dict')
    for i in range(self.seq_length - len(seq)):
      # Padding with '_'
      seq.append(self.char_dict['_'])
    return np.array(seq)
+72 −0
Original line number Diff line number Diff line
@@ -1380,6 +1380,78 @@ class TestOverfit(test_util.TensorFlowTestCase):

    assert scores[regression_metric.name] > .8

  def test_textCNN_singletask_classification_overfit(self):
    """Test textCNN model overfits tiny data."""
    np.random.seed(123)
    tf.set_random_seed(123)
    n_tasks = 1

    featurizer = dc.feat.RawFeaturizer()
    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir, "example_classification.csv")
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(input_file)

    classification_metric = dc.metrics.Metric(dc.metrics.accuracy_score)

    char_dict, length = dc.models.TextCNNTensorGraph.build_char_dict(dataset)
    batch_size = 10

    model = dc.models.TextCNNTensorGraph(
        n_tasks,
        char_dict,
        seq_length=length,
        batch_size=batch_size,
        learning_rate=0.001,
        use_queue=False,
        mode="classification")

    # Fit trained model
    model.fit(dataset, nb_epoch=200)

    # Eval model on train
    scores = model.evaluate(dataset, [classification_metric])

    assert scores[classification_metric.name] > .8

  def test_textCNN_singletask_regression_overfit(self):
    """Test textCNN model overfits tiny data."""
    np.random.seed(123)
    tf.set_random_seed(123)
    n_tasks = 1

    # Load mini log-solubility dataset.
    featurizer = dc.feat.RawFeaturizer()
    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir, "example_regression.csv")
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(input_file)

    regression_metric = dc.metrics.Metric(
        dc.metrics.pearson_r2_score, task_averager=np.mean)

    char_dict, length = dc.models.TextCNNTensorGraph.build_char_dict(dataset)
    batch_size = 10

    model = dc.models.TextCNNTensorGraph(
        n_tasks,
        char_dict,
        seq_length=length,
        batch_size=batch_size,
        learning_rate=0.001,
        use_queue=False,
        mode="regression")

    # Fit trained model
    model.fit(dataset, nb_epoch=200)

    # Eval model on train
    scores = model.evaluate(dataset, [regression_metric])

    assert scores[regression_metric.name] > .9

  def test_siamese_singletask_classification_overfit(self):
    """Test siamese singletask model overfits tiny data."""
    np.random.seed(123)
+47 −0
Original line number Diff line number Diff line
"""
Script that trains textCNN models on delaney dataset.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import numpy as np
np.random.seed(123)
import tensorflow as tf
tf.set_random_seed(123)
import deepchem as dc

# Load Delaney dataset
delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney(
    featurizer='Raw', split='index')
train_dataset, valid_dataset, test_dataset = delaney_datasets

# Fit models
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

char_dict, length = dc.models.TextCNNTensorGraph.build_char_dict(train_dataset)

# Batch size of models
batch_size = 64

model = dc.models.TextCNNTensorGraph(
    len(delaney_tasks),
    char_dict,
    seq_length=length,
    mode='regression',
    learning_rate=1e-3,
    batch_size=batch_size,
    use_queue=False)

# Fit trained model
model.fit(train_dataset, nb_epoch=100)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)