Commit 8be6317b authored by leswing's avatar leswing
Browse files

GOGO

parent 973a0123
Loading
Loading
Loading
Loading
+24 −3
Original line number Diff line number Diff line
@@ -507,14 +507,14 @@ class AspuruGuzikAutoEncoder(SeqToSeq):
      if dropout > 0.0:
        prev_layer = layers.Dropout(dropout, in_layers=prev_layer)
      prev_layer = layers.Conv1D(
          kernel_size,
          filter_size,
          filters=filter_size,
          kernel_size=kernel_size,
          in_layers=prev_layer,
          activation_fn=tf.nn.relu)
    prev_layer = layers.Flatten(prev_layer)
    prev_layer = layers.Dense(
        self._decoder_dimension, in_layers=prev_layer, activation_fn=tf.nn.relu)
    prev_layer = layers.BatchNorm(prev_layer)
    #prev_layer = layers.BatchNorm(prev_layer)
    if self._variational:
      self._embedding_mean = layers.Dense(
          self._embedding_dimension,
@@ -587,3 +587,24 @@ class AspuruGuzikAutoEncoder(SeqToSeq):
        for i in range(len(batch)):
          result.append(self._beam_search(probs[i], beam_width))
    return result

  def predict_embeddings(self, sequences):
    """Compute the embedding vector for each of the given input sequences.

    The sequences are processed batch by batch; for every batch the
    `embedding` tensor is evaluated and one row per batch element is kept.

    Parameters
    ----------
    sequences: iterable
      the input sequences to generate an embedding vector for

    Returns
    -------
    a float32 numpy array built from the collected embedding rows, in the
    order the batches were produced
    """
    collected = []
    with self._get_tf("Graph").as_default():
      for chunk in self._batch_elements(sequences):
        feed = {
            self._features: self._create_output_array(chunk),
            # 0.0 presumably selects inference (non-training) behaviour --
            # confirm against where the placeholder is consumed.
            self._training_placeholder: 0.0,
        }
        # Feed each RNN initial-state entry with its paired zero state.
        feed.update(zip(self.rnn_initial_states, self.rnn_zero_states))
        batch_embeddings = self.session.run(self.embedding, feed_dict=feed)
        # Only rows 0..len(chunk)-1 are kept; any further rows in the result
        # (presumably batch padding -- TODO confirm) are ignored.
        collected.extend(batch_embeddings[row] for row in range(len(chunk)))
    return np.array(collected, dtype=np.float32)
+46 −0
Original line number Diff line number Diff line
@@ -58,6 +58,52 @@ class TestSeqToSeq(unittest.TestCase):
    assert count1 >= 12
    assert count4 >= 12

  def test_aspuru_guzik(self):
    """Test that the aspuru_guzik encoder doesn't hard error.

    This model takes too long to fit to do an overfit test, so the
    reconstruction accuracy is deliberately not checked. What is checked is
    that decoding from the embeddings returned by `predict_embeddings` gives
    the same result as decoding directly from the input sequences, for beam
    widths 1 and 4.
    """
    train_smiles = [
        'Cc1cccc(N2CCN(C(=O)C34CC5CC(CC(C5)C3)C4)CC2)c1C',
        'Cn1ccnc1SCC(=O)Nc1ccc(Oc2ccccc2)cc1',
        'COc1cc2c(cc1NC(=O)CN1C(=O)NC3(CCc4ccccc43)C1=O)oc1ccccc12',
        'O=C1/C(=C/NC2CCS(=O)(=O)C2)c2ccccc2C(=O)N1c1ccccc1',
        'NC(=O)NC(Cc1ccccc1)C(=O)O', 'CCn1c(CSc2nccn2C)nc2cc(C(=O)O)ccc21',
        'CCc1cccc2c1NC(=O)C21C2C(=O)N(Cc3ccccc3)C(=O)C2C2CCCN21',
        'COc1ccc(C2C(C(=O)NCc3ccccc3)=C(C)N=C3N=CNN32)cc1OC',
        'CCCc1cc(=O)nc(SCC(=O)N(CC(C)C)C2CCS(=O)(=O)C2)[nH]1',
        'CCn1cnc2c1c(=O)n(CC(=O)Nc1cc(C)on1)c(=O)n2Cc1ccccc1'
    ]
    # The vocabulary is every distinct character used by the training SMILES.
    tokens = sorted(set(''.join(train_smiles)))
    max_length = max(len(smiles) for smiles in train_smiles) + 1
    model = dc.models.tensorgraph.models.seqtoseq.AspuruGuzikAutoEncoder(
        tokens, max_length)

    def generate_sequences(smiles, epochs):
      # Autoencoder training pairs: each sequence is its own target.
      for _ in range(epochs):
        for molecule in smiles:
          yield (molecule, molecule)

    model.fit_sequences(generate_sequences(train_smiles, 100))

    # Test it out.
    pred1 = model.predict_from_sequences(train_smiles, beam_width=1)
    pred4 = model.predict_from_sequences(train_smiles, beam_width=4)
    embeddings = model.predict_embeddings(train_smiles)
    pred1e = model.predict_from_embeddings(embeddings, beam_width=1)
    pred4e = model.predict_from_embeddings(embeddings, beam_width=4)

    # Every prediction list must cover every input; checked explicitly so
    # that zip() below cannot hide a length mismatch.
    for predictions in (pred1, pred4, pred1e, pred4e):
      assert len(predictions) == len(train_smiles)

    # Decoding via the embeddings must agree with decoding from sequences.
    for from_sequence, from_embedding in zip(pred1, pred1e):
      assert from_sequence == from_embedding
    for from_sequence, from_embedding in zip(pred4, pred4e):
      assert from_sequence == from_embedding

  def test_variational(self):
    """Test using a SeqToSeq model as a variational autoenconder."""