Commit 3fbd4a83 authored by miaecle's avatar miaecle
Browse files

DTNN build

parent 3bd2083a
Loading
Loading
Loading
Loading
+195 −0
Original line number Diff line number Diff line
import os
import sys
import numpy as np
import tensorflow as tf
import sklearn.metrics
import tempfile
from deepchem.utils.save import log
from deepchem.models import Model
from deepchem.nn.copy import Input
from deepchem.nn.copy import Dense
from deepchem.data import pad_features
from deepchem.nn import model_ops
# TODO(rbharath): Find a way to get rid of this import?
from deepchem.models.tf_new_models.graph_topology import merge_dicts
from deepchem.models.tf_new_models.multitask_classifier import get_loss_fn


class MultitaskGraphRegressor(Model):
  """Multitask regressor trained on the outputs of a sequential graph model.

  Wraps a graph model (any object exposing ``graph``,
  ``get_graph_topology()`` and ``return_outputs()``), adds label/weight
  placeholders, a per-task loss and an Adam optimizer, and runs everything
  in a ``tf.Session`` bound to the model's graph.
  """

  def __init__(self,
               model,
               n_tasks,
               n_feat,
               logdir=None,
               batch_size=50,
               final_loss='weighted_L2',
               learning_rate=.001,
               optimizer_type="adam",
               learning_rate_decay_time=1000,
               beta1=.9,
               beta2=.999,
               pad_batches=True,
               verbose=True):
    """Builds the training graph on top of `model` and initializes variables.

    Parameters
    ----------
    model: object
      Sequential graph model providing `graph`, `get_graph_topology()` and
      `return_outputs()`.
    n_tasks: int
      Number of regression tasks (columns of the label/weight arrays).
    n_feat: int
      Stored as `self.feat_dim`.
    logdir: str, optional
      Directory for checkpoints. Created if missing; a fresh temporary
      directory is used when None.
    batch_size: int, optional
      Batch size used for training, prediction and loss normalization.
    final_loss: str, optional
      Loss name handed to `get_loss_fn`.
    learning_rate: float, optional
      Learning rate passed to the optimizer.
    optimizer_type: str, optional
      Only "adam" is accepted by `add_optimizer`.
    learning_rate_decay_time: int, optional
      Stored as `self.T`; not otherwise used in this class.
    beta1, beta2: float, optional
      Adam moment decay rates.
    pad_batches: bool, optional
      Whether batches are padded up to `batch_size`.
    verbose: bool, optional
      Forwarded to `log` to control progress messages.
    """
    self.verbose = verbose
    self.n_tasks = n_tasks
    self.final_loss = final_loss
    self.model = model
    self.sess = tf.Session(graph=self.model.graph)
    if logdir is not None:
      if not os.path.exists(logdir):
        os.makedirs(logdir)
    else:
      logdir = tempfile.mkdtemp()
    self.logdir = logdir

    with self.model.graph.as_default():
      # Extract model info
      self.batch_size = batch_size
      self.pad_batches = pad_batches
      # Get graph topology for x
      self.graph_topology = self.model.get_graph_topology()
      self.feat_dim = n_feat

      # Building outputs
      self.outputs = self.build()
      self.loss_op = self.add_training_loss(self.final_loss, self.outputs)

      self.learning_rate = learning_rate
      self.T = learning_rate_decay_time
      self.optimizer_type = optimizer_type

      self.optimizer_beta1 = beta1
      self.optimizer_beta2 = beta2

      # Set epsilon
      self.epsilon = 1e-7
      self.add_optimizer()

      # Initialize only after the optimizer exists so that its slot
      # variables are covered by the initializer as well.
      self.init_fn = tf.initialize_all_variables()
      self.sess.run(self.init_fn)

      # Path to save checkpoint files, which matches the
      # replicated supervisor's default path.
      self._save_path = os.path.join(logdir, 'model.ckpt')

  def build(self):
    """Creates the label/weight placeholders and returns the model outputs."""
    # Create target inputs
    self.label_placeholder = tf.placeholder(
        dtype='float32', shape=(None, self.n_tasks), name="label_placeholder")
    self.weight_placeholder = tf.placeholder(
        dtype='float32', shape=(None, self.n_tasks), name="weight_placholder")

    return self.model.return_outputs()

  def add_optimizer(self):
    """Creates the optimizer and the training op minimizing `self.loss_op`.

    Raises
    ------
    ValueError
      If `self.optimizer_type` is anything other than "adam".
    """
    if self.optimizer_type != "adam":
      raise ValueError("Optimizer type not recognized.")
    self.optimizer = tf.train.AdamOptimizer(
        self.learning_rate,
        beta1=self.optimizer_beta1,
        beta2=self.optimizer_beta2,
        epsilon=self.epsilon)

    # Get train function
    self.train_op = self.optimizer.minimize(self.loss_op)

  def construct_feed_dict(self, X_b, y_b=None, w_b=None, training=True):
    """Builds the feed_dict for one batch.

    Parameters
    ----------
    X_b: sequence
      Batch of inputs, handed to the graph topology's `batch_to_feed_dict`.
    y_b: np.ndarray, optional
      Labels of shape (len(X_b), n_tasks); zeros are fed when None.
    w_b: np.ndarray, optional
      Weights of shape (len(X_b), n_tasks); zeros are fed when None.
    training: bool, optional
      Currently unused; kept for interface compatibility.

    Returns
    -------
    dict
      Label, weight and topology placeholders mapped to their values.
    """
    n_samples = len(X_b)
    if y_b is None:
      y_b = np.zeros((n_samples, self.n_tasks))
    if w_b is None:
      w_b = np.zeros((n_samples, self.n_tasks))
    targets_dict = {self.label_placeholder: y_b, self.weight_placeholder: w_b}

    # Get graph information
    atoms_dict = self.graph_topology.batch_to_feed_dict(X_b)

    # TODO (hraut->rhbarath): num_datapoints should be a vector, with ith element being
    # the number of labeled data points in target_i. This is to normalize each task
    # num_dat_dict = {self.num_datapoints_placeholder : self.}

    # TODO(rbharath): Figure out how to handle phase appropriately
    return merge_dicts([targets_dict, atoms_dict])

  def add_training_loss(self, final_loss, outputs):
    """Sums the per-task losses and divides by the batch size.

    Parameters
    ----------
    final_loss: str
      Loss name handed to `get_loss_fn`.
    outputs: sequence of tf.Tensor
      Indexed by task; `outputs[task]` is the prediction for that task.

    Returns
    -------
    tf.Tensor
      Total loss over all tasks divided by `self.batch_size`.
    """
    loss_fn = get_loss_fn(final_loss)  # Get loss function
    # The placeholders have shape (batch_size, n_tasks). Splitting along
    # axis 1 gives n_tasks tensors of shape (batch_size, 1), which are
    # squeezed before being handed to the loss function.
    # NOTE(review): tf.split is called with the (axis, num_split, value)
    # argument order -- confirm this matches the installed TensorFlow.
    task_labels = tf.split(1, self.n_tasks, self.label_placeholder)
    task_weights = tf.split(1, self.n_tasks, self.weight_placeholder)
    task_losses = []
    for task in range(self.n_tasks):
      task_losses.append(
          loss_fn(outputs[task],
                  tf.squeeze(task_labels[task]),
                  tf.squeeze(task_weights[task])))
    # It's ok to divide by just the batch_size rather than the number of nonzero
    # examples (effect averages out)
    total_loss = tf.add_n(task_losses)
    return tf.div(total_loss, self.batch_size)

  def fit(self,
          dataset,
          nb_epoch=10,
          max_checkpoints_to_keep=5,
          log_every_N_batches=50,
          checkpoint_interval=10,
          **kwargs):
    """Runs `nb_epoch` passes of the training op over `dataset`.

    Parameters
    ----------
    dataset: object
      Must provide `iterbatches(batch_size, pad_batches=...)` yielding
      (X, y, w, ids) tuples.
    nb_epoch: int, optional
      Number of passes over the dataset.
    max_checkpoints_to_keep: int, optional
      Unused; checkpointing is currently disabled.
    log_every_N_batches: int, optional
      A progress line is logged every this many batches.
    checkpoint_interval: int, optional
      Unused; checkpointing is currently disabled.
    """
    log("Training for %d epochs" % nb_epoch, self.verbose)

    # TODO(rbharath): Disabling saving for now to try to debug.
    for epoch in range(nb_epoch):
      log("Starting epoch %d" % epoch, self.verbose)
      batches = dataset.iterbatches(
          self.batch_size, pad_batches=self.pad_batches)
      for batch_num, (X_b, y_b, w_b, ids_b) in enumerate(batches):
        if batch_num % log_every_N_batches == 0:
          log("On batch %d" % batch_num, self.verbose)
        self.sess.run(
            self.train_op, feed_dict=self.construct_feed_dict(X_b, y_b, w_b))

  def save(self):
    """
    No-op since this model doesn't currently support saving...
    """
    pass

  def predict(self, dataset, transformers=None, **kwargs):
    """Wraps the superclass predict to pin the batch size.

    Parameters
    ----------
    dataset: object
      Dataset forwarded to the superclass `predict`.
    transformers: list, optional
      Transformers forwarded to the superclass; None means no transformers.
      (A None default avoids sharing one mutable list between calls.)
    """
    if transformers is None:
      transformers = []
    return super(MultitaskGraphRegressor, self).predict(
        dataset, transformers, batch_size=self.batch_size)

  def predict_on_batch(self, X):
    """Returns the model output for the provided input.

    Parameters
    ----------
    X: sequence
      Batch of inputs; padded up to `self.batch_size` when `pad_batches`
      is set.

    Returns
    -------
    np.ndarray
      Array of shape (len(X) after padding, n_tasks), one column per task.
    """
    if self.pad_batches:
      X = pad_features(self.batch_size, X)
    # run eval data through the model
    with self.sess.as_default():
      feed_dict = self.construct_feed_dict(X)
      # One entry per task
      batch_outputs = self.sess.run(self.outputs, feed_dict=feed_dict)

    outputs = np.zeros((len(X), self.n_tasks))
    for task, output in enumerate(batch_outputs):
      outputs[:, task] = output
    return outputs

  def get_num_tasks(self):
    """Needed to use Model.predict() from superclass."""
    return self.n_tasks
+61 −1
Original line number Diff line number Diff line
@@ -11,7 +11,7 @@ __license__ = "GPL"

import tensorflow as tf
from deepchem.nn.layers import GraphGather
from deepchem.models.tf_new_models.graph_topology import GraphTopology
from deepchem.models.tf_new_models.graph_topology import GraphTopology, DTNNGraphTopology


class SequentialGraph(object):
@@ -79,6 +79,66 @@ class SequentialGraph(object):
  def get_layer(self, layer_id):
    return self.layers[layer_id]

class SequentialDTNNGraph(object):
  """An analog of Keras Sequential class for DTNN graph data.

  Like the Sequential class from Keras, but automatically passes the
  topology placeholders from DTNNGraphTopology to each `DTNNStep` layer
  added to the network. All other layers only receive the running output.
  """

  def __init__(self, max_n_atoms, n_distance):
    """
    Parameters
    ----------
    max_n_atoms: int
      Forwarded to DTNNGraphTopology.
    n_distance: int
      Forwarded to DTNNGraphTopology.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
      self.graph_DTNN_topology = DTNNGraphTopology(max_n_atoms, n_distance)
      # Expose the topology under the `graph_topology` name as well; the
      # original code read that attribute without ever assigning it.
      self.graph_topology = self.graph_DTNN_topology
      # The atom number placeholder is the input of the first layer.
      self.output = self.graph_DTNN_topology.get_atom_number_placeholder()
    # Keep track of the layers
    self.layers = []

  def add(self, layer):
    """Adds a new layer to model.

    `DTNNStep` layers (matched by class name) are called with a list made
    of the current output followed by the topology placeholders; every
    other layer is called with the current output alone.
    """
    with self.graph.as_default():
      if type(layer).__name__ in ['DTNNStep']:
        topology = self.graph_DTNN_topology.get_topology_placeholders()
        self.output = layer([self.output] + topology)
      else:
        self.output = layer(self.output)

      # Add layer to the layer list
      self.layers.append(layer)

  def get_graph_topology(self):
    """Returns the DTNNGraphTopology that owns the input placeholders."""
    return self.graph_DTNN_topology

  def get_num_output_features(self):
    """Gets the output shape of the featurization layers of the network"""
    return self.layers[-1].output_shape[1]

  def return_outputs(self):
    """Returns the output of the most recently added layer."""
    return self.output

  def return_inputs(self):
    """Returns all input placeholders of the topology."""
    return self.graph_DTNN_topology.get_input_placeholders()

  def get_layer(self, layer_id):
    """Returns the layer stored at index `layer_id`."""
    return self.layers[layer_id]

class SequentialSupportGraph(object):
  """An analog of Keras Sequential model for test/support models."""
+115 −0
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@ __author__ = "Han Altae-Tran and Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"

import numpy as np
import tensorflow as tf
from deepchem.nn.copy import Input
from deepchem.feat.mol_graphs import ConvMol
@@ -139,3 +140,117 @@ class GraphTopology(object):
        self.membership_placeholder: batch.membership
    }
    return merge_dicts([atoms_dict, deg_adj_dict])

class DTNNGraphTopology(object):
  """Manages placeholders associated with batch of graphs and their topology"""

  def __init__(self, max_n_atoms, n_distance, name='DTNN_topology'):
    """Creates the placeholders that feed a DTNN model.

    The batch dimension of every placeholder is left unspecified (None).

    Parameters
    ----------
    max_n_atoms: int
      Size of the atom axis of every placeholder.
    n_distance: int
      Size of the last axis of the distance matrix placeholder.
    name: str, optional
      Name of this manager, used as prefix for the placeholder names.
    """
    self.name = name
    self.max_n_atoms = max_n_atoms
    self.n_distance = n_distance

    # Shape shared by the two pairwise (atom x atom) placeholders.
    pairwise_shape = (None, self.max_n_atoms, self.max_n_atoms)

    self.atom_number_placeholder = tf.placeholder(
        dtype='int32',
        shape=(None, self.max_n_atoms),
        name=self.name + '_atom_number')
    self.distance_matrix_placeholder = tf.placeholder(
        dtype='float32',
        shape=pairwise_shape + (self.n_distance,),
        name=self.name + '_distance_matrix')
    self.distance_matrix_mask_placeholder = tf.placeholder(
        dtype=tf.float32,
        shape=pairwise_shape,
        name=self.name + '_distance_matrix_mask')

    # Topology tensors are the pairwise ones; the full input list is the
    # atom numbers followed by the topology.
    self.topology = [
        self.distance_matrix_placeholder,
        self.distance_matrix_mask_placeholder
    ]
    self.inputs = [self.atom_number_placeholder] + self.topology

  def get_input_placeholders(self):
    """All placeholders.

    Returns the list stored in `self.inputs`: the atom number placeholder
    followed by the topology placeholders.
    """
    return self.inputs

  def get_topology_placeholders(self):
    """Returns topology placeholders

    Consists of the distance matrix placeholder and the distance matrix
    mask placeholder, in that order.
    """
    return self.topology

  def get_atom_number_placeholder(self):
    """Returns the int32 placeholder holding the per-atom numbers."""
    return self.atom_number_placeholder

  def get_distance_matrix_placeholder(self):
    """Returns the float32 placeholder holding the distance matrix."""
    return self.distance_matrix_placeholder

  def batch_to_feed_dict(self, batch):
    """Converts a batch of per-molecule matrices into a tensorflow feed_dict.

    Each matrix is decoded as follows: the diagonal entry `d` of atom i
    yields the integer `round((2 * d) ** (1 / 2.4))`, and every positive
    off-diagonal entry `e` at (i, j) yields `Z_i * Z_j / e`. All other
    entries of the resulting distance matrix are 0.
    NOTE(review): this matches decoding a Coulomb matrix
    (diag = 0.5 * Z**2.4, off-diag = Z_i * Z_j / distance) -- confirm
    against the featurizer that produces `batch`.

    The input is never modified; all results are written to new arrays.

    params
    ------
    batch : np.ndarray
      Array of shape (n_molecules, n_atoms, n_atoms).

    returns
    -------
    feed_dict : dict
      Maps the atom number, distance matrix and distance matrix mask
      placeholders to their values. The distance matrix is passed through
      `self.gauss_expand` before being stored. Can be merged with other
      feed_dicts for input into tensorflow.
    """
    matrices = np.asarray(batch, dtype=float)

    # Per-atom integers recovered from the diagonal of every matrix.
    diagonals = np.diagonal(matrices, axis1=1, axis2=2)
    atom_number = np.asarray(
        np.round(np.power(2 * diagonals, 1 / 2.4)), dtype=int)

    # Pairwise products Z_i * Z_j for every molecule.
    ZiZj = atom_number[:, :, np.newaxis] * atom_number[:, np.newaxis, :]

    # Only positive off-diagonal entries are converted; the divisor is
    # replaced by 1 elsewhere so no division by zero is ever evaluated.
    off_diagonal = ~np.eye(matrices.shape[1], matrices.shape[2], dtype=bool)
    valid = (matrices > 0) & off_diagonal
    safe_divisor = np.where(valid, matrices, 1.0)
    distance_matrix = np.where(valid, ZiZj / safe_divisor, 0.0)

    # The mask is 1.0 wherever a non-zero distance was produced.
    distance_matrix_mask = np.asarray(distance_matrix != 0, dtype=float)
    distance_matrix = self.gauss_expand(distance_matrix)
    # Generate dicts
    dict_DTNN = {
        self.atom_number_placeholder: atom_number,
        self.distance_matrix_placeholder: distance_matrix,
        self.distance_matrix_mask_placeholder: distance_matrix_mask
    }
    return dict_DTNN
    
  @staticmethod
  def gauss_expand(distance_matrix):
    for im, molecule in enumerate(distance_matrix):
      for ir, row in enumerate(molecule):
        for ie, element in enumerate(row):
          if element>0
 No newline at end of file
+130 −0
Original line number Diff line number Diff line
@@ -819,3 +819,133 @@ class LSTMStep(Layer):
    #return o, [h, c]
    return h, [h, c]
    ####################################################### DEBUG

class DTNNembedding(Layer):
  """Looks up a trainable feature vector for every integer atom number."""

  def __init__(self,
               n_features=20,
               periodic_table_length=83,
               init='glorot_uniform',
               **kwargs):
    """
    Parameters
    ----------
    n_features: int, optional
      Length of the feature vector stored per table row.
    periodic_table_length: int, optional
      Number of rows in the embedding table.
    init: str, optional
      Name of the weight initialization, resolved via `initializations.get`.
    """
    self.n_features = n_features
    self.periodic_table_length = periodic_table_length
    self.init = initializations.get(init)  # Set weight initialization

    # Must name this class: super(GraphPool, self) raises TypeError because
    # self is not an instance of GraphPool.
    super(DTNNembedding, self).__init__(**kwargs)

  def build(self, input_shape):
    """Creates the (periodic_table_length, n_features) embedding table."""
    self.embedding_list = self.init(
        [self.periodic_table_length, self.n_features])
    self.trainable_weights = [self.embedding_list]

  def call(self, x):
    """Execute this layer on input tensors.

    Parameters
    ----------
    x: Tensor
      Integer tensor used as row indices into the embedding table.

    Returns
    -------
    tf.Tensor
      Result of `tf.nn.embedding_lookup`: the shape of `x` with a trailing
      axis of length n_features appended.
    """
    atom_features = tf.nn.embedding_lookup(self.embedding_list, x)
    return atom_features
    
class DTNNStep(Layer):
  """One interaction step that updates atom features from pairwise inputs."""

  def __init__(self,
               n_features,
               n_distance,
               n_hidden=20,
               init='glorot_uniform',
               activation='tanh',
               **kwargs):
    """
    Parameters
    ----------
    n_features: int
      Length of the per-atom feature vectors (input and output).
    n_distance: int
      Length of the last axis of the distance matrix input.
    n_hidden: int, optional
      Size of the hidden factor space both inputs are projected into.
    init: str, optional
      Name of the weight initialization, resolved via `initializations.get`.
    activation: str, optional
      Name of the activation, resolved via `activations.get`.
    """
    self.n_features = n_features
    self.n_distance = n_distance
    self.n_hidden = n_hidden
    self.init = initializations.get(init)  # Set weight initialization
    self.activation = activations.get(activation)  # Get activations

    # Must name this class: super(GraphPool, self) raises TypeError because
    # self is not an instance of GraphPool.
    super(DTNNStep, self).__init__(**kwargs)

  def build(self, input_shape):
    """Creates the projection weights and biases of the step."""
    self.W_cf = self.init([self.n_features, self.n_hidden])
    self.W_df = self.init([self.n_distance, self.n_hidden])
    self.W_fc = self.init([self.n_hidden, self.n_features])
    self.b_cf = model_ops.zeros(shape=[self.n_hidden,])
    self.b_df = model_ops.zeros(shape=[self.n_hidden,])

    self.trainable_weights = [self.W_cf, self.W_df, self.W_fc,
                              self.b_cf, self.b_df]

  def call(self, x):
    """Execute this layer on input tensors.

    Parameters
    ----------
    x: list of Tensor
      [atom_features, distance_matrix, distance_matrix_mask].
      atom_features is contracted over axis 2 (length n_features),
      distance_matrix over axis 3 (length n_distance), and the mask gets a
      trailing axis added at position 3 before being multiplied in.

    Returns
    -------
    tf.Tensor
      The sum over axis 2 of the activated, masked pairwise terms, added
      to atom_features.
    """
    atom_features = x[0]
    distance_matrix = x[1]
    distance_matrix_mask = x[2]

    # Project both inputs into the hidden factor space.
    distance_hidden = tf.tensordot(
        distance_matrix, self.W_df, [[3], [0]]) + self.b_df
    atom_hidden = tf.tensordot(
        atom_features, self.W_cf, [[2], [0]]) + self.b_cf

    # The atom projection gets a new axis 1 so it broadcasts against the
    # pairwise distance projection.
    outputs = tf.multiply(distance_hidden,
                          tf.expand_dims(atom_hidden, axis=1))
    outputs = tf.tensordot(outputs, self.W_fc, [[3], [0]])
    # Masked-out pairs are zeroed before the activation is applied.
    outputs = tf.multiply(outputs, tf.expand_dims(distance_matrix_mask, axis=3))
    outputs = self.activation(outputs)
    # Residual update: summed pairwise terms plus the incoming features.
    outputs = tf.reduce_sum(outputs, axis=2) + atom_features

    return outputs
    
class DTNNGather(Layer):
  """Maps atom features to one value per atom and sums them over axis 1."""

  def __init__(self,
               n_features,
               n_hidden=20,
               init='glorot_uniform',
               activation='tanh',
               **kwargs):
    """
    Parameters
    ----------
    n_features: int
      Length of the incoming per-atom feature vectors.
    n_hidden: int, optional
      Size of the hidden layer applied to every atom.
    init: str, optional
      Name of the weight initialization, resolved via `initializations.get`.
    activation: str, optional
      Name of the activation, resolved via `activations.get`.
    """
    self.n_features = n_features
    self.n_hidden = n_hidden
    self.init = initializations.get(init)  # Set weight initialization
    self.activation = activations.get(activation)  # Get activations

    # Must name this class: super(GraphPool, self) raises TypeError because
    # self is not an instance of GraphPool.
    super(DTNNGather, self).__init__(**kwargs)

  def build(self, input_shape):
    """Creates the weights and biases of the two dense projections."""
    self.W_out1 = self.init([self.n_features, self.n_hidden])
    self.W_out2 = self.init([self.n_hidden, 1])
    self.b_out1 = model_ops.zeros(shape=[self.n_hidden,])
    self.b_out2 = model_ops.zeros(shape=[1,])

    self.trainable_weights = [self.W_out1, self.W_out2,
                              self.b_out1, self.b_out2]

  def call(self, x):
    """Execute this layer on input tensors.

    Parameters
    ----------
    x: Tensor
      Atom features, contracted over axis 2 (length n_features).

    Returns
    -------
    tf.Tensor
      Per-atom scalar outputs (axis 2 squeezed away) summed over axis 1.
    """
    hidden = tf.tensordot(x, self.W_out1, [[2], [0]]) + self.b_out1
    hidden = self.activation(hidden)
    per_atom = tf.tensordot(hidden, self.W_out2, [[2], [0]]) + self.b_out2
    # Drop the trailing length-1 axis, then sum the per-atom values.
    outputs = tf.reduce_sum(tf.squeeze(per_atom, axis=2), axis=1)

    return outputs
 No newline at end of file