Commit 2481bf40 authored by miaecle

docs and yapf

parent 13673b98
+24 −23
@@ -509,13 +509,15 @@ class Highway(Layer):
    inputs = self._get_input_tensors(in_layers)
    parent = inputs[0]
    shape = parent.get_shape().as_list()[1]
    dense1 = tf.contrib.layers.fully_connected(parent,
    dense1 = tf.contrib.layers.fully_connected(
        parent,
        num_outputs=shape,
        activation_fn=self.activation_fn,
        biases_initializer=self.biases_initializer(),
        weights_initializer=self.weights_initializer(),
        trainable=True)
    dense2 = tf.contrib.layers.fully_connected(parent,
    dense2 = tf.contrib.layers.fully_connected(
        parent,
        num_outputs=shape,
        activation_fn=tf.nn.sigmoid,
        biases_initializer=tf.constant_initializer(-1),
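
The Highway layer being reformatted here follows Srivastava et al. (https://arxiv.org/abs/1505.00387): dense1 is the transform branch H(x) and dense2 is the gate T(x), whose bias is initialized to -1 so the layer is biased toward carrying its input through at the start of training. A minimal NumPy sketch of the forward pass; the weight names and the ReLU transform are illustrative assumptions, not part of this diff:

    import numpy as np

    def sigmoid(x):
      return 1.0 / (1.0 + np.exp(-x))

    def highway_forward(x, W_h, b_h, W_t, b_t):
      # y = H(x) * T(x) + x * (1 - T(x))
      H = np.maximum(0.0, x @ W_h + b_h)   # transform branch (ReLU assumed here)
      T = sigmoid(x @ W_t + b_t)           # transform gate, bias initialized to -1 as above
      return H * T + x * (1.0 - T)         # carry the input where the gate is closed

    x = np.random.randn(4, 8)
    W_h, b_h = 0.1 * np.random.randn(8, 8), np.zeros(8)
    W_t, b_t = 0.1 * np.random.randn(8, 8), np.full(8, -1.0)
    y = highway_forward(x, W_h, b_h, W_t, b_t)
    assert y.shape == x.shape              # highway layers preserve the input width
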
@@ -1562,13 +1564,10 @@ class Conv3D(Layer):
      self.out_tensor = out_tensor
    return out_tensor


class MaxPool1D(Layer):

  def __init__(self,
               window_shape=2,
               strides=1,
               padding="SAME",
               **kwargs):
  def __init__(self, window_shape=2, strides=1, padding="SAME", **kwargs):
    self.window_shape = window_shape
    self.strides = strides
    self.padding = padding
@@ -1584,7 +1583,8 @@ class MaxPool1D(Layer):
  def create_tensor(self, in_layers=None, set_tensors=True, **kwargs):
    inputs = self._get_input_tensors(in_layers)
    in_tensor = inputs[0]
    out_tensor = tf.nn.pool(in_tensor,
    out_tensor = tf.nn.pool(
        in_tensor,
        window_shape=[self.window_shape],
        pooling_type=self.pooling_type,
        padding=self.padding,
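
MaxPool1D wraps tf.nn.pool over a single time axis. For intuition, a NumPy sketch of what a window of 2 with stride 1 computes on one channel; the values and the VALID-style windowing are illustrative assumptions:

    import numpy as np

    x = np.array([1.0, 3.0, 2.0, 5.0, 4.0])          # one channel over time
    window, stride = 2, 1
    pooled = np.array([x[i:i + window].max()
                       for i in range(0, len(x) - window + 1, stride)])
    print(pooled)                                     # [3. 3. 5. 5.]
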
@@ -1593,6 +1593,7 @@ class MaxPool1D(Layer):
      self.out_tensor = out_tensor
    return out_tensor


class MaxPool2D(Layer):

  def __init__(self,
+86 −71
@@ -19,6 +19,7 @@ from deepchem.models.tensorgraph.layers import L2Loss, Label, Weights, Feature
from deepchem.models.tensorgraph.tensor_graph import TensorGraph
from deepchem.trans import undo_transforms

# Common symbols in SMILES, note that Cl and Br are regarded as one symbol
default_dict = {
    '#': 1,
    '(': 2,
@@ -52,11 +53,14 @@ default_dict = {
    'Br': 30,
    'n': 31,
    'o': 32,
 's': 33}
    's': 33
}


class TextCNNTensorGraph(TensorGraph):

  def __init__(self,
  def __init__(
      self,
      n_tasks,
      char_dict,
      seq_length,
@@ -72,7 +76,7 @@ class TextCNNTensorGraph(TensorGraph):
    n_tasks: int
      Number of tasks
    char_dict: dict
      Mapping from characters in smiles to integer tokens
      Mapping from characters in smiles to integers
    seq_length: int
      Length of sequences(after padding)
    n_embedding: int, optional
@@ -97,6 +101,8 @@ class TextCNNTensorGraph(TensorGraph):

  @staticmethod
  def build_char_dict(dataset, default_dict=default_dict):
    """ Collect all unique characters(in smiles) from the dataset.
    """
    X = dataset.ids
    seq_length = int(max([len(smile) for smile in X]) * 1.2)
    all_smiles = '_'.join(X)
@@ -104,7 +110,7 @@ class TextCNNTensorGraph(TensorGraph):
    keys = default_dict.keys()
    out_dict = copy.deepcopy(default_dict)
    current_key_val = len(keys) + 1
    keys.extend(['_', ' '])
    keys.extend([' '])
    extra_keys = []
    i = 0
    while i < tot_len:
@@ -119,31 +125,39 @@ class TextCNNTensorGraph(TensorGraph):
    for extra_key in extra_keys:
      out_dict[extra_key] = current_key_val
      current_key_val += 1
    out_dict['_'] = current_key_val
    return out_dict, seq_length

  def build_graph(self):
    self.smiles_seqs = Feature(shape=(None, self.seq_length), dtype=tf.int32)
    self.Embedding = DTNNEmbedding(n_embedding=self.n_embedding,
    # Character embedding
    self.Embedding = DTNNEmbedding(
        n_embedding=self.n_embedding,
        periodic_table_length=len(self.char_dict.keys()) + 1,
        in_layers=[self.smiles_seqs])
    self.pooled_outputs = []
    self.conv_layers = []
    for filter_size, num_filter in zip(self.filter_sizes, self.num_filters):
      self.conv_layers.append(Conv1D(filter_size,
      # Multiple convolutional layers with different filter widths
      self.conv_layers.append(
          Conv1D(
              filter_size,
              num_filter,
              padding='VALID',
              in_layers=[self.Embedding]))
      self.pooled_outputs.append(MaxPool1D(window_shape=self.seq_length-filter_size+1,
      # Max-over-time pooling
      self.pooled_outputs.append(
          MaxPool1D(
              window_shape=self.seq_length - filter_size + 1,
              strides=1,
              padding='VALID',
              in_layers=[self.conv_layers[-1]]))

    concat_outputs = Concat(axis=2, in_layers=self.pooled_outputs)
    outputs = Squeeze(squeeze_dims=1, in_layers=concat_outputs)
    #HIGHWAY LAYER
    dropout = Dropout(dropout_prob=self.dropout, in_layers=[outputs])
    dense = Dense(out_channels=200, activation_fn=tf.nn.relu, in_layers=[dropout])
    dense = Dense(
        out_channels=200, activation_fn=tf.nn.relu, in_layers=[dropout])
    # Highway layer from https://arxiv.org/pdf/1505.00387.pdf
    self.gather = Highway(in_layers=[dense])

    costs = []
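
Taken together, build_graph above follows the standard TextCNN pattern (Kim 2014, https://arxiv.org/abs/1408.5882): embed the token sequence, apply parallel 1-D convolutions of different widths, max-pool each feature map over the whole sequence, concatenate, then apply dropout, a dense layer, and the highway layer. A rough shape trace of that pipeline; the batch size, sequence length, embedding width, and filter configuration below are made-up illustrative values, not the model's defaults:

    import numpy as np

    batch, seq_len, n_embed = 10, 35, 75                  # illustrative sizes only
    filter_sizes, num_filters = [3, 4, 5], [100, 100, 100]

    embedded = np.random.randn(batch, seq_len, n_embed)   # DTNNEmbedding output
    pooled = []
    for f, n in zip(filter_sizes, num_filters):
      conv = np.random.randn(batch, seq_len - f + 1, n)   # stand-in for Conv1D output, padding='VALID'
      pooled.append(conv.max(axis=1, keepdims=True))      # MaxPool1D over time -> (batch, 1, n)
    features = np.concatenate(pooled, axis=2)             # Concat(axis=2)
    features = features.squeeze(axis=1)                   # Squeeze(squeeze_dims=1)
    assert features.shape == (batch, sum(num_filters))    # (10, 300), then Dropout / Dense / Highway
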
@@ -176,15 +190,13 @@ class TextCNNTensorGraph(TensorGraph):
    loss = WeightedError(in_layers=[all_cost, self.weights])
    self.set_loss(loss)


  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        deterministic=True,
                        pad_batches=True):
    """ TensorGraph style implementation
    similar to deepchem.models.tf_new_models.graph_topology.AlternateWeaveTopology.batch_to_feed_dict
    """ Transfer smiles strings to fixed length integer vectors
    """
    for epoch in range(epochs):
      if not predict:
@@ -209,8 +221,9 @@ class TextCNNTensorGraph(TensorGraph):
        yield feed_dict

  def smiles_to_seq(self, smiles):
    """ Tokenize characters in smiles to integers
    """
    smiles_len = len(smiles)
    # Starting token
    seq = [0]
    keys = self.char_dict.keys()
    i = 0
@@ -218,6 +231,7 @@ class TextCNNTensorGraph(TensorGraph):
      if smiles[i:i + 1] == ' ':
        i = i + 1
      elif smiles[i:i + 2] in keys:
        # For Cl, Br, etc.
        seq.append(self.char_dict[smiles[i:i + 2]])
        i = i + 2
      elif smiles[i:i + 1] in keys:
@@ -226,5 +240,6 @@ class TextCNNTensorGraph(TensorGraph):
      else:
        raise ValueError('character not found in dict')
    for i in range(self.seq_length - len(seq)):
      # Padding
      seq.append(self.char_dict['_'])
    return np.array(seq)
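
The tokenizer above matches two-character symbols before single characters, so 'Cl' and 'Br' become one token each, and pads with '_' up to seq_length; build_char_dict correspondingly extends default_dict with any extra characters found in the dataset and reserves the last index for the '_' padding symbol. A standalone sketch of the same tokenization logic; tiny_dict and the sequence length of 10 are made-up values for illustration:

    # Greedy tokenization: try a two-character symbol first, then a single character.
    tiny_dict = {'C': 1, 'N': 2, 'O': 3, '(': 4, ')': 5, '=': 6, 'Cl': 7, 'Br': 8, '_': 9}

    def tokenize_smiles(smiles, char_dict, seq_length):
      seq = [0]                                    # starting token, as in the diff
      i = 0
      while i < len(smiles):
        if smiles[i:i + 2] in char_dict:           # two-character symbols: Cl, Br
          seq.append(char_dict[smiles[i:i + 2]])
          i += 2
        elif smiles[i:i + 1] in char_dict:
          seq.append(char_dict[smiles[i:i + 1]])
          i += 1
        else:
          raise ValueError('character not found in dict')
      seq += [char_dict['_']] * (seq_length - len(seq))   # pad to fixed length
      return seq

    print(tokenize_smiles('CCl', tiny_dict, 10))   # [0, 1, 7, 9, 9, 9, 9, 9, 9, 9]
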
+72 −0
@@ -1380,6 +1380,78 @@ class TestOverfit(test_util.TensorFlowTestCase):

    assert scores[regression_metric.name] > .8

  def test_textCNN_singletask_classification_overfit(self):
    """Test textCNN model overfits tiny data."""
    np.random.seed(123)
    tf.set_random_seed(123)
    n_tasks = 1

    featurizer = dc.feat.RawFeaturizer()
    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir, "example_classification.csv")
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(input_file)

    classification_metric = dc.metrics.Metric(dc.metrics.accuracy_score)

    char_dict, length = dc.models.TextCNNTensorGraph.build_char_dict(dataset)
    batch_size = 10

    model = dc.models.TextCNNTensorGraph(
        n_tasks,
        char_dict,
        seq_length=length,
        batch_size=batch_size,
        learning_rate=0.001,
        use_queue=False,
        mode="classification")

    # Fit trained model
    model.fit(dataset, nb_epoch=200)

    # Eval model on train
    scores = model.evaluate(dataset, [classification_metric])

    assert scores[classification_metric.name] > .8

  def test_textCNN_singletask_regression_overfit(self):
    """Test textCNN model overfits tiny data."""
    np.random.seed(123)
    tf.set_random_seed(123)
    n_tasks = 1

    # Load mini log-solubility dataset.
    featurizer = dc.feat.RawFeaturizer()
    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir, "example_regression.csv")
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(input_file)

    regression_metric = dc.metrics.Metric(
        dc.metrics.pearson_r2_score, task_averager=np.mean)

    char_dict, length = dc.models.TextCNNTensorGraph.build_char_dict(dataset)
    batch_size = 10

    model = dc.models.TextCNNTensorGraph(
        n_tasks,
        char_dict,
        seq_length=length,
        batch_size=batch_size,
        learning_rate=0.001,
        use_queue=False,
        mode="regression")

    # Fit trained model
    model.fit(dataset, nb_epoch=200)

    # Eval model on train
    scores = model.evaluate(dataset, [regression_metric])

    assert scores[regression_metric.name] > .9

  def test_siamese_singletask_classification_overfit(self):
    """Test siamese singletask model overfits tiny data."""
    np.random.seed(123)
+1 −1
@@ -34,7 +34,7 @@ model = dc.models.TextCNNTensorGraph(
    use_queue=False)

# Fit trained model
model.fit(train_dataset, nb_epoch=50)
model.fit(train_dataset, nb_epoch=100)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)