Commit cea0f862 authored by Atreya Majumdar's avatar Atreya Majumdar
Browse files

Merge branch 'master' of https://github.com/deepchem/deepchem into sparse_adam

parents 25ece837 2b136d22
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@ from deepchem.models.scscore import ScScoreModel

from deepchem.models.seqtoseq import SeqToSeq
from deepchem.models.gan import GAN, WGAN
from deepchem.models.molgan import BasicMolGANModel
from deepchem.models.cnn import CNN
from deepchem.models.text_cnn import TextCNNModel
from deepchem.models.atomic_conv import AtomicConvModel
+351 −0
Original line number Diff line number Diff line
from typing import List, Tuple, Any

import tensorflow as tf
from deepchem.feat.molecule_featurizers.molgan_featurizer import GraphMatrix
from deepchem.models import WGAN
from deepchem.models.layers import MolGANEncoderLayer
from tensorflow import keras
from tensorflow.keras import layers


class BasicMolGANModel(WGAN):
  """
  Model for de-novo generation of small molecules based on work of Nicola De Cao et al. [1]_.
  Utilizes WGAN infrastructure; uses adjacency matrix and node features as inputs.
  Inputs need to be one-hot representation.

  Examples
  --------
  >>>
  >> import deepchem as dc
  >> from deepchem.models import BasicMolGANModel as MolGAN
  >> from deepchem.models.optimizers import ExponentialDecay
  >> from tensorflow import one_hot
  >> smiles = ['CCC', 'C1=CC=CC=C1', 'CNC' ]
  >> # create featurizer
  >> feat = dc.feat.MolGanFeaturizer()
  >> # featurize molecules
  >> features = feat.featurize(smiles)
  >> # Remove empty objects
  >> features = list(filter(lambda x: x is not None, features))
  >> # create model
  >> gan = MolGAN(learning_rate=ExponentialDecay(0.001, 0.9, 5000))
  >> dataset = dc.data.NumpyDataset([x.adjacency_matrix for x in features],[x.node_features for x in features])
  >> def iterbatches(epochs):
  >>     for i in range(epochs):
  >>         for batch in dataset.iterbatches(batch_size=gan.batch_size, pad_batches=True):
  >>             adjacency_tensor = one_hot(batch[0], gan.edges)
  >>             node_tensor = one_hot(batch[1], gan.nodes)
  >>             yield {gan.data_inputs[0]: adjacency_tensor, gan.data_inputs[1]:node_tensor}
  >> gan.fit_gan(iterbatches(8), generator_steps=0.2, checkpoint_interval=5000)
  >> generated_data = gan.predict_gan_generator(1000)
  >> # convert graphs to RDKitmolecules
  >> nmols = feat.defeaturize(generated_data)
  >> print("{} molecules generated".format(len(nmols)))
  >> # remove invalid moles
  >> nmols = list(filter(lambda x: x is not None, nmols))
  >> # currently training is unstable so 0 is a common outcome
  >> print ("{} valid molecules".format(len(nmols)))

  References
  ----------
  .. [1] Nicola De Cao et al. "MolGAN: An implicit generative model
  for small molecular graphs", https://arxiv.org/abs/1805.11973
  """

  def __init__(self,
               edges: int = 5,
               vertices: int = 9,
               nodes: int = 5,
               embedding_dim: int = 10,
               dropout_rate: float = 0.0,
               **kwargs):
    """
    Initialize the model

    Parameters
    ----------
    edges: int, default 5
        Number of bond types includes BondType.Zero
    vertices: int, default 9
        Max number of atoms in adjacency and node features matrices
    nodes: int, default 5
        Number of atom types in node features matrix
    embedding_dim: int, default 10
        Size of noise input array
    dropout_rate: float, default = 0.
        Rate of dropout used across whole model
    name: str, default ''
        Name of the model
    """

    self.edges = edges
    self.vertices = vertices
    self.nodes = nodes
    self.embedding_dim = embedding_dim
    self.dropout_rate = dropout_rate

    super(BasicMolGANModel, self).__init__(**kwargs)

  def get_noise_input_shape(self) -> Tuple[int]:
    """
    Return shape of the noise input used in generator

    Returns
    -------
    Tuple
        Shape of the noise input
    """

    return (self.embedding_dim,)

  def get_data_input_shapes(self) -> List:
    """
    Return input shape of the discriminator

    Returns
    -------
    List
        List of shapes used as an input for distriminator.
    """
    return [
        (self.vertices, self.vertices, self.edges),
        (self.vertices, self.nodes),
    ]

  def create_generator(self) -> keras.Model:
    """
    Create generator model.
    Take noise data as an input and processes it through number of
    dense and dropout layers. Then data is converted into two forms
    one used for training and other for generation of compounds.
    The model has two outputs:
      1. edges
      2. nodes
    The format differs depending on intended use (training or sample generation).
    For sample generation use flag, sample_generation=True while calling generator
    i.e. gan.generators[0](noise_input, training=False, sample_generation=True).
    In case of training, not flag is necessary.
    """
    return BasicMolGANGenerator(
        vertices=self.vertices,
        edges=self.edges,
        nodes=self.nodes,
        dropout_rate=self.dropout_rate,
        embedding_dim=self.embedding_dim)

  def create_discriminator(self) -> keras.Model:
    """
    Create discriminator model based on MolGAN layers.
    Takes two inputs:
      1. adjacency tensor, containing bond information
      2. nodes tensor, containing atom information
    The input vectors need to be in one-hot encoding format.
    Use MolGAN featurizer for that purpose. It will be simplified
    in the future release.
    """
    adjacency_tensor = layers.Input(
        shape=(self.vertices, self.vertices, self.edges))
    node_tensor = layers.Input(shape=(self.vertices, self.nodes))

    graph = MolGANEncoderLayer(
        units=[(128, 64), 128],
        dropout_rate=self.dropout_rate,
        edges=self.edges)([adjacency_tensor, node_tensor])
    dense = layers.Dense(units=128, activation="tanh")(graph)
    dense = layers.Dropout(self.dropout_rate)(dense)
    dense = layers.Dense(units=64, activation="tanh")(dense)
    dense = layers.Dropout(self.dropout_rate)(dense)
    output = layers.Dense(units=1)(dense)

    return keras.Model(
        inputs=[
            adjacency_tensor,
            node_tensor,
        ], outputs=[output])

  def predict_gan_generator(self,
                            batch_size: int = 1,
                            noise_input: List = None,
                            conditional_inputs: List = [],
                            generator_index: int = 0) -> List[GraphMatrix]:
    """
    Use the GAN to generate a batch of samples.

    Parameters
    ----------
    batch_size: int
      the number of samples to generate.  If either noise_input or
      conditional_inputs is specified, this argument is ignored since the batch
      size is then determined by the size of that argument.
    noise_input: array
      the value to use for the generator's noise input.  If None (the default),
      get_noise_batch() is called to generate a random input, so each call will
      produce a new set of samples.
    conditional_inputs: list of arrays
      NOT USED.
      the values to use for all conditional inputs.  This must be specified if
      the GAN has any conditional inputs.
    generator_index: int
      NOT USED.
      the index of the generator (between 0 and n_generators-1) to use for
      generating the samples.

    Returns
    -------
    List[GraphMatrix]
      Returns a list of GraphMatrix object that can be converted into
      RDKit molecules using MolGANFeaturizer defeaturize function.
    """

    if noise_input is not None:
      batch_size = len(noise_input)
    if noise_input is None:
      noise_input = self.get_noise_batch(batch_size)
    print(f"Generating {batch_size} samples")
    adjacency_matrix, nodes_features = self.generators[0](
        noise_input, training=False, sample_generation=True)
    graphs = [
        GraphMatrix(i, j)
        for i, j in zip(adjacency_matrix.numpy(), nodes_features.numpy())
    ]
    return graphs


class BasicMolGANGenerator(tf.keras.Model):
  """
  Generator class for BasicMolGAN model.
  Using subclassing rather than functional API due to requirement
  to swap between two outputs depending on situation.
  In order to get output that used for sample generation
  (conversion to rdkit molecules) pass sample_generation=True argument while
  calling the model i.e. adjacency_matrix, nodes_features = self.generators[0](
  noise_input, training=False, sample_generation=True)
  This is automatically done in predict_gan_generator().
  """

  def __init__(self,
               vertices: int = 9,
               edges: int = 5,
               nodes: int = 5,
               dropout_rate: float = 0.,
               embedding_dim: int = 10,
               name: str = "SimpleMolGANGenerator",
               **kwargs):
    """
    Initialize model.

    Parameters
    ----------
    vertices : int, optional
        number of max atoms dataset molecules (incl. empty atom), by default 9
    edges : int, optional
        number of bond types in molecules, by default 5
    nodes : int, optional
        number of atom types in molecules, by default 5
    dropout_rate : float, optional
        rate of dropout, by default 0.
    embedding_dim : int, optional
        noise input dimensions, by default 10
    name : str, optional
        name of the model, by default "SimpleMolGANGenerator"
    """
    super(BasicMolGANGenerator, self).__init__(name=name, **kwargs)
    self.vertices = vertices
    self.edges = edges
    self.nodes = nodes
    self.dropout_rate = dropout_rate
    self.embedding_dim = embedding_dim

    self.dense1 = layers.Dense(
        128, activation="tanh", input_shape=(self.embedding_dim,))
    self.dropout1 = layers.Dropout(self.dropout_rate)
    self.dense2 = layers.Dense(256, activation="tanh")
    self.dropout2 = layers.Dropout(self.dropout_rate)
    self.dense3 = layers.Dense(512, activation="tanh")
    self.dropout3 = layers.Dropout(self.dropout_rate)

    # edges logits used during training
    self.edges_dense = layers.Dense(
        units=self.edges * self.vertices * self.vertices, activation=None)
    self.edges_reshape = layers.Reshape((self.edges, self.vertices,
                                         self.vertices))
    self.edges_matrix_transpose1 = layers.Permute((1, 3, 2))
    self.edges_matrix_transpose2 = layers.Permute((2, 3, 1))
    self.edges_dropout = layers.Dropout(self.dropout_rate)

    # nodes logits used during training
    self.nodes_dense = layers.Dense(
        units=(self.vertices * self.nodes), activation=None)
    self.nodes_reshape = layers.Reshape((self.vertices, self.nodes))
    self.nodes_dropout = layers.Dropout(self.dropout_rate)

  def call(self,
           inputs: Any,
           training: bool = False,
           sample_generation: bool = False) -> List[Any]:
    """
    Call generator model

    Parameters
    ----------
    inputs : Any
        List of inputs, typically noise_batch
    training : bool, optional
        used by dropout layers, by default False
    sample_generation : bool, optional
        decide which output to use, by default False

    Returns
    -------
    List[Any, Any]
        Tensors containing either softmax values for training
        or argmax for sample generation (used for creation of rdkit molecules).
    """

    x = self.dense1(inputs)
    x = self.dropout1(x)
    x = self.dense2(x)
    x = self.dropout2(x)
    x = self.dense3(x)
    x = self.dropout3(x)

    # edges logits
    edges_logits = self.edges_dense(x)
    edges_logits = self.edges_reshape(edges_logits)
    matrix_transpose = self.edges_matrix_transpose1(edges_logits)
    edges_logits = (edges_logits + matrix_transpose) / 2
    edges_logits = self.edges_matrix_transpose2(edges_logits)
    edges_logits = self.edges_dropout(edges_logits)

    # nodes logits
    nodes_logits = self.nodes_dense(x)
    nodes_logits = self.nodes_reshape(nodes_logits)
    nodes_logits = self.nodes_dropout(nodes_logits)

    if sample_generation is False:
      # training of the model
      edges = tf.nn.softmax(edges_logits)
      nodes = tf.nn.softmax(nodes_logits)
    else:
      # generating compounds
      e_gumbel_logits = edges_logits - tf.math.log(-tf.math.log(
          tf.random.uniform(tf.shape(edges_logits), dtype=edges_logits.dtype)))
      e_gumbel_argmax = tf.one_hot(
          tf.argmax(e_gumbel_logits, axis=-1),
          depth=e_gumbel_logits.shape[-1],
          dtype=e_gumbel_logits.dtype,
      )
      edges = tf.argmax(e_gumbel_argmax, axis=-1)

      # nodes logits used during compound generation
      n_gumbel_logits = nodes_logits - tf.math.log(-tf.math.log(
          tf.random.uniform(tf.shape(nodes_logits), dtype=nodes_logits.dtype)))
      n_gumbel_argmax = tf.one_hot(
          tf.argmax(n_gumbel_logits, axis=-1),
          depth=n_gumbel_logits.shape[-1],
          dtype=n_gumbel_logits.dtype,
      )
      nodes = tf.argmax(n_gumbel_argmax, axis=-1)

    return [edges, nodes]
+24 −0
Original line number Diff line number Diff line
"Molecule"
"C1=CC=CC=C1"
"FC1=CC=CC=C1"
"FC1=CC(F)=CC=C1"
"FC1=CC=C(F)C=C1"
"CC1=CC(F)=CC=C1"
"OC1=CC(F)=CC=C1"
"NC1=CC(F)=CC=C1"
"CC1=CC(C)=CC=C1"
"CC1=CC=CC(O)=C1"
"CC1=CC=CC(N)=C1"
"OC1=CC(O)=CC=C1"
"NC1=CC(O)=CC=C1"
"NC1=CC(N)=CC=C1"
"CC1=CC=CC=C1"
"OC1=CC=CC=C1"
"OC1=CC=CC(F)=C1"
"NC1=CC=CC=C1"
"NC1=CC=CC(F)=C1"
"CC1=CC=C(F)C=C1"
"OC1=CC=C(F)C=C1"
"NC1=CC=CC(O)=C1"
"NC1=CC=C(F)C=C1"
"CC1=CC=C(C)C=C1"
+132 −0
Original line number Diff line number Diff line
import os
import unittest

import pandas as pd
from deepchem.data import NumpyDataset
from deepchem.feat.molecule_featurizers import MolGanFeaturizer
from deepchem.models import BasicMolGANModel as MolGAN
from deepchem.models.optimizers import ExponentialDecay
from tensorflow import one_hot
from tensorflow.keras.backend import clear_session as keras_clear_session


class test_molgan_model(unittest.TestCase):
  """
  Unit testing for MolGAN basic layers
  """

  def setUp(self):
    self.training_attempts = 6
    self.current_dir = os.path.dirname(os.path.abspath(__file__))
    self.vertices = 9
    self.nodes = 5
    self.edges = 5
    self.embedding_dim = 10
    self.dropout_rate = 0.0
    self.batch_size = 100
    self.first_convolution_unit = 128
    self.second_convolution_unit = 64
    self.aggregation_unit = 128
    self.model = MolGAN(
        edges=self.edges,
        vertices=self.vertices,
        nodes=self.nodes,
        embedding_dim=self.embedding_dim,
        dropout_rate=self.dropout_rate)

  def test_build(self):
    """
    Test if initialization data is set-up correctly
    """
    model = self.model
    assert model.batch_size == self.batch_size
    assert model.edges == self.edges
    assert model.nodes == self.nodes
    assert model.vertices == self.vertices
    assert model.dropout_rate == self.dropout_rate
    assert len(model.generators) == 1
    assert len(model.discriminators) == 1

  def test_shapes(self):
    """
    Check if input and output shapes are correct
    """
    model = self.model

    # test if adjacency matrix input is correctly set
    assert model.discriminators[0].input_shape[0] == (None, self.vertices,
                                                      self.vertices, self.edges)
    # test if nodes features matrix input is correctly set
    assert model.discriminators[0].input_shape[1] == (None, self.vertices,
                                                      self.edges)
    # check discriminator shape
    assert model.discriminators[0].output_shape == (None, 1)
    # check training edges logits shape
    assert model.generators[0].output_shape[0] == (None, self.vertices,
                                                   self.vertices, self.edges)
    # check training nodes logits shapes
    assert model.generators[0].output_shape[1] == (None, self.vertices,
                                                   self.nodes)

  def test_training(self):
    """
    Check training of the basicMolGANmodel on small number of compounds.
    Due to training instability try a few times and see if it worked at least once.
    Typically it fails between 1-3 times of 10.
    This is something that needs to be addressed in future releases.
    """

    input_file = os.path.join(self.current_dir, "molgan_example.csv")
    data = pd.read_csv(input_file)
    molecules = list(data['Molecule'])
    feat = MolGanFeaturizer()
    featurized = feat.featurize(molecules)
    dataset = NumpyDataset([x.adjacency_matrix for x in featurized],
                           [x.node_features for x in featurized])

    # True will be assigned up successful training attempt
    success = False

    for _ in range(self.training_attempts):
      # force clear tensor flow backend
      keras_clear_session()
      # create new model
      gan = MolGAN(learning_rate=ExponentialDecay(0.001, 0.9, 5000))

      # to avoid flake8 E125/yapf incompatibility
      s = gan.batch_size

      # generate input
      def iterbatches(epochs):
        for __ in range(epochs):
          for batch in dataset.iterbatches(batch_size=s, pad_batches=True):
            adjacency_tensor = one_hot(batch[0], gan.edges)
            node_tesor = one_hot(batch[1], gan.nodes)

            yield {
                gan.data_inputs[0]: adjacency_tensor,
                gan.data_inputs[1]: node_tesor
            }

      # train model
      gan.fit_gan(iterbatches(1000), generator_steps=0.2, checkpoint_interval=0)

      # generate sample
      g = gan.predict_gan_generator(1000)

      # check how many valid molecules were created and add to list
      generated_molecules = feat.defeaturize(g)
      valid_molecules_count = len(
          list(filter(lambda x: x is not None, generated_molecules)))
      print(valid_molecules_count)
      if valid_molecules_count:
        success = True
        break

    # finally test if there was at least one valid training session
    # as the model structure improves this should become more and more strict
    assert success


if __name__ == '__main__':
  unittest.main()
+6 −0
Original line number Diff line number Diff line
@@ -356,6 +356,12 @@ MPNNModel
.. autoclass:: deepchem.models.MPNNModel
  :members:

BasicMolGANModel
---------

.. autoclass:: deepchem.models.BasicMolGANModel
  :members:

ScScoreModel
------------