Commit c7ae000b authored by miaecle's avatar miaecle
Browse files

docs change

parent bba48cdf
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -13,6 +13,8 @@ class DTNNGraphRegressor(MultitaskGraphRegressor):

    feat = self.model.return_outputs()
    feat_size = self.feat_dim
    # dimension of `feat` becomes Unknown after tf.tensordot operation
    # need to define dimension of W and b explicitly
    outputs = []
    W_list = []
    b_list = []
+63 −59
Original line number Diff line number Diff line
@@ -324,7 +324,6 @@ class DAGGraphTopology(GraphTopology):
    feed_dict : dict
      Can be merged with other feed_dicts for input into tensorflow
    """
    # Merge mol conv objects
    
    atoms_per_mol = [mol.get_num_atoms() for mol in batch]
    n_atom_features = batch[0].get_atom_features().shape[1]
@@ -336,41 +335,42 @@ class DAGGraphTopology(GraphTopology):
        axis=0)

    atoms_all = []
    parents_all = []
    # calculation orders for a batch of molecules
    parents_all = []
    calculation_orders = []
    for idm, mol in enumerate(batch):
      # padding atom features vector of each molecule with 0
      atom_features_padded = np.concatenate(
          [
              mol.get_atom_features(), np.zeros(
                  (self.max_atoms - atoms_per_mol[idm], n_atom_features))
          ],
          axis=0)
      # padding atom features vector of each molecule with 0
      atoms_all.append(atom_features_padded)
      
      parents = self.UG_to_DAG(mol)
      # calculation orders for DAGs
      assert len(parents) == atoms_per_mol[idm]
      parents = self.UG_to_DAG(mol)
      # number of DAGs should equal number of atoms
      assert len(parents) == atoms_per_mol[idm]
      parents_all.extend(parents[:])
      # padding with `max_atoms`
      parents_all.extend([
          self.max_atoms * np.ones((self.max_atoms, self.max_atoms), dtype=int)
          for i in range(self.max_atoms - atoms_per_mol[idm])
      ])
      # padding with max_atoms
      for parent in parents:
        calculation_orders.append(self.indice_changing(parent[:, 0], idm))
        # the indice for a specific atom in the molecule's DAGs and atom_features_placeholder
        # is different, this function changes the indice from the position in current molecule(DAGs) 
        # to position in batch of molecules(atom_features_placeholder)
        # and this is only going to be used in tf.gather on atom_features_placeholder
        # index for an atom in `parents_all` and `atoms_all` is different, 
        # this function changes the index from the position in current molecule(DAGs, `parents_all`) 
        # to position in batch of molecules(`atoms_all`)
        # only used in tf.gather on `atom_features_placeholder`
        calculation_orders.append(self.index_changing(parent[:, 0], idm))
      
      # padding with `batch_size*max_atoms`
      calculation_orders.extend([
          self.batch_size * self.max_atoms * np.ones(
              (self.max_atoms,), dtype=int)
          for i in range(self.max_atoms - atoms_per_mol[idm])
      ])
      # padding with (batch_size*max_atoms)
      
    atoms_all = np.concatenate(atoms_all, axis=0)
    parents_all = np.stack(parents_all, axis=0)
@@ -384,9 +384,9 @@ class DAGGraphTopology(GraphTopology):

    return atoms_dict

  def indice_changing(self, indice, n_mol):
    output = np.zeros_like(indice)
    for ide, element in enumerate(indice):
  def index_changing(self, index, n_mol):
    output = np.zeros_like(index)
    for ide, element in enumerate(index):
      if element < self.max_atoms:
        output[ide] = element + n_mol * self.max_atoms
      else:
@@ -396,82 +396,86 @@ class DAGGraphTopology(GraphTopology):
  def UG_to_DAG(self, sample):
    """This function generates the DAGs for a molecule
    """
    # list of calculation orders for DAGs
    # stemming from one specific atom in the molecule
    parents = []
    # list of DAGs, one DAG represents the calculation orders
    # stemming from one specific atom in the molecule,
    # hence this list include k elements for a molecule with k atoms
    UG = sample.get_adjacency_list()
    # starting from the adjacency list derived by graphconv featurizer
    UG = sample.get_adjacency_list()
    # number of atoms, also number of DAGs
    n_atoms = sample.get_num_atoms()
    # number of graphs need to be generated
    # DAG on a molecule with k atoms includes k steps of calculation, 
    # each step calculating graph features for one atom.
    # `max_atoms` is the maximum number of steps
    max_atoms = self.max_atoms
    # for a graph on a molecule with k atoms, there will be k steps, 
    # each step calculate graph features for one atom,
    # maximum number of steps is the same as max_atoms
    for count in range(n_atoms):
      # each iteration generates one DAG
      # stemming from atom with indice `count`
      # each iteration generates the DAG starting from atom with index `count`
      DAG = []
      # list of lists, elements represent the calculation orders
      # for atoms in the current graph
      parent = [[] for i in range(n_atoms)]
      # list of lists, each element(also a list) represents the calculation order
      # for every atom in the molecule in the current graph
      # starting from the target atom with index `count`
      current_atoms = [count]
      # starting from the atom with indice `count`
      # flags of whether the atom is already included in the DAG
      atoms_indicator = np.ones((n_atoms,))
      # flags, whether the atom is already included in the DAG
      atoms_indicator[count] = 0
      # atom `count` is in the DAG
      radial = 0
      atoms_indicator[count] = 0
      # recording number of radial propagation steps
      radial = 0
      while np.sum(atoms_indicator) > 0:
        # in this while loop, atoms directly connected to `count` will be first added into
        # the DAG(radial=0), then atoms two-bond away from `count` will be added in the
        # second loop(radial=1). Atoms i-bond away will be added in loop i
        # in the fisrt loop, atoms directly connected to `count` will be added 
        # into the DAG(radial=0), then atoms two-bond away from `count` 
        # will be added in the second loop(radial=1). 
        # atoms i-bond away will be added in i-th loop
        if radial > n_atoms:
          # when molecules have separate parts, starting from one part,
          # it is not possible to include all atoms.
          # this break quit the loop when going into such condition
          break
          # when molecules have separate parts, starting from one part, it is not possible
          # to include all atoms.
        next_atoms = []
        # reinitialize targets for next iteration
        next_atoms = []
        for current_atom in current_atoms:
          for atom_adj in UG[current_atom]:
            # atoms connected to current_atom
            if atoms_indicator[atom_adj] > 0:
              # generate the dependency map of current DAG
              # atoms connected to `current_atoms`(and not included in the DAG)
              # are added, and will be the `current_atoms` for next iteration.
              DAG.append((current_atom, atom_adj))
              # this for loop generates the dependency map of this DAG
              # atoms that connected to current_atoms(and not included in the DAG yet)
              # are added into DAG, and will be the current_atoms for next iteration.
              atoms_indicator[atom_adj] = 0
              next_atoms.append(atom_adj)
              # including into targets for next iteration
        current_atoms = next_atoms
        # into next iteration, finding atoms connected one more bond away
        radial = radial + 1
      # DAG starts from the target atom, calculation should go in reverse
      for edge in reversed(DAG):
        # DAG starts from the target atom, hence the calculation should go in reverse
        # `edge[1]` is the parent of `edge[0]`
        parent[edge[0]].append(edge[1])
        # edge[1] is the parent of edge[0]
        # all the parents of `edge[1]` is also the parents of `edge[0]`
        parent[edge[0]].extend(parent[edge[1]])
        # all the parents of edge[1] is also the parents of edge[0]
      # after this for loop, parents[i] is the list that includes all parents of atom i
      # after this loop, `parents[i]` includes all parents of atom i
      
      for ids, atom in enumerate(parent):
        # manually adding the atom index into its parents list
        parent[ids].insert(0, ids)
        # manually adding the atom indice into its parents list
      # after this for loop, parents[i][0] = i, parents[i][1:] are all parents of atom i
      # after this loop, `parents[i][0]` is i, `parents[i][1:]` are all parents of atom i
      
      # atoms with less parents(farther from the target atom) come first.
      # graph features of atoms without parents will be first calculated,
      # then atoms with more parents can be calculated in order 
      # based on previously calculated graph features.
      # target atom of this DAG will be calculated in the last step
      parent = sorted(parent, key=len)
      # key part of this function, atoms with less parents come first,
      # so when we do a for loop on the list , atoms without parents will be first calculated
      # then atoms with more parents can be calculated based on calculated graph features.
      # the starting atom of this DAG will be calculated in the last step, 
      # since every other atom is its parent.
      
      for ids, atom in enumerate(parent):
        n_par = len(atom)
        # padding with `max_atoms`
        parent[ids].extend([max_atoms for i in range(max_atoms - n_par)])
        # padding with max_atoms
        
      while len(parent) < max_atoms:
        parent.insert(0, [max_atoms] * max_atoms)
        # padding
        parent.insert(0, [max_atoms] * max_atoms)
      # `parents[i]` is the calculation order for the DAG stemming from atom i,
      # which is a max_atoms * max_atoms numpy array after padding
      parents.append(np.array(parent))
      # parents[i] is the calculation order for the DAG stemming from atom i,
      # which is a max_atoms * max_atoms numpy array(after padding)
      
    return parents
+43 −38
Original line number Diff line number Diff line
@@ -1082,79 +1082,84 @@ class DAGLayer(Layer):
    self.build()

    # Extract atom_features
    atom_features = x[0]
    # Basic features of every atom: (batch_size*max_atoms) * n_atom_features
    atom_features = x[0]
    
    # calculation orders of graph: (batch_size*max_atoms) * max_atoms * max_atoms
    # each atom corresponds to a graph, which is represented by the `max_atoms*max_atoms` int32 matrix of index
    # each gragh include `max_atoms` of steps(corresponding to rows) of calculating graph features
    # step i calculates the graph features for atoms of index `parents[:,i,0]`
    parents = x[1]
    # Structure of graph: (batch_size*max_atoms) * max_atoms * max_atoms
    # each atom corresponds to a graph, which is represented by the max_atoms * max_atoms int32 matrix of indices
    # there are in total max_atoms number of steps(corresponding to rows) in calculating the graph outputs
    # in step i, we calculate the graph features of atom(i,0)
    # from inputs: atom features of atom(i,0), graph_features of this atom's parents in the graph(atom(i,1) through atom(i,max_atoms))
    # if number of parents is less than max_atoms-1, padded it with max_atoms, representing a dummy with all zeros)
    calculation_orders = x[2]
    # (batch_size*max_atoms) * max_atoms
    # indices of atom(i,0)
    # represent the same atoms of parents[:, :, 0], different in that the indices are for atom_features(0~max_atoms*batch_size)
    
    # target atoms for each step: (batch_size*max_atoms) * max_atoms
    # represent the same atoms of `parents[:, :, 0]`, 
    # different in that these index are positions in `atom_features`
    # paded with max_atoms*batch_size
    calculation_orders = x[2]
    # flags: (batch_size*max_atoms)
    # 0 for paddings, 1 for real atoms
    membership = x[3]
    # (batch_size*max_atoms)
    # 0 for dummy atoms, 1 for real atoms
    # number of atoms in total, should equal `batch_size*max_atoms`
    n_atoms = atom_features.get_shape()[0]
    # number of atoms in total, =batch_size
    
    # initialize graph features for each graph
    # another row of zeros is generated for padded dummy atoms
    graph_features = tf.Variable(
        tf.constant(0., shape=(n_atoms, self.max_atoms + 1, self.n_graph_feat)),
        trainable=False)
    # Initialize graph features for atoms in the molecule for each graph
    # for each graph, another row of zeros is generated as the dummy
    # add dummy
    atom_features = tf.concat(
        axis=0,
        values=[
            atom_features, tf.constant(0., shape=(1, self.n_atom_features))
        ])
    # dummy
    for count in range(self.max_atoms):
      # count-th step
      # `count`-th step
      # extracting atom features of target atoms: (batch_size*max_atoms) * n_atom_features
      batch_atom_features = tf.gather(atom_features,
                                      calculation_orders[:, count])
      # extracting atom features of target atoms, shape: (batch_size*max_atoms) * n_atom_features
      
      indice = tf.stack(
      # generating index for graph features used in the inputs
      index = tf.stack(
          [
              tf.reshape(
                  tf.stack([tf.range(n_atoms)] * (self.max_atoms - 1), axis=1),
                  [-1]), tf.reshape(parents[:, count, 1:], [-1])
          ],
          axis=1)
      # generating indices for graph features used in the inputs
      # extracting graph features for parents of the target atoms, then flatten
      # shape: (batch_size*max_atoms) * [(max_atoms-1)*n_graph_features]
      batch_graph_features = tf.reshape(
          tf.gather_nd(graph_features, indice),
          tf.gather_nd(graph_features, index),
          [-1, (self.max_atoms - 1) * self.n_graph_feat])
      # extracting graph features of the parents of the target atoms, then flatten
      # shape: (batch_size*max_atoms) * [(max_atoms-1)*n_graph_features]
      
      # concat into the input tensor: (batch_size*max_atoms) * n_inputs
      batch_inputs = tf.concat(
          axis=1, values=[batch_atom_features, batch_graph_features])
      # concat into the input tensor, shape: (batch_size*max_atoms) * n_inputs
      # DAGgraph_step maps from batch_inputs to a batch of graph_features
      # of shape: (batch_size*max_atoms) * n_graph_features
      # representing the graph features of target atoms in each graph
      batch_outputs = self.DAGgraph_step(batch_inputs, self.W_list, self.b_list)
      # DAGgraph_step mapping from batch_inputs to a batch of graph_features
      # shape: (batch_size*max_atoms) * n_graph_features
      # representing the graph features of the target atoms in each graph
      target_indices = tf.stack(
      
      # index for targe atoms
      target_index = tf.stack(
          [tf.range(n_atoms), parents[:, count, 0]], axis=1)
      target_indices2 = tf.stack(
      # index for dummies
      target_index2 = tf.stack(
          [tf.range(n_atoms), tf.constant(self.max_atoms, shape=(n_atoms,))],
          axis=1)
      graph_features = tf.scatter_nd_update(graph_features, target_indices,
                                            batch_outputs)
      # update the graph features for target atoms
      graph_features = tf.scatter_nd_update(graph_features, target_indices2,
      graph_features = tf.scatter_nd_update(graph_features, target_index,
                                            batch_outputs)
      # recover dummies to zeros if being updated
      graph_features = tf.scatter_nd_update(graph_features, target_index2,
                                            tf.zeros(
                                                (n_atoms, self.n_graph_feat)))
      # recover dummies to zeros if being updated
      
    # last step generates graph features for all target atoms
    # masking the outputs
    outputs = tf.multiply(batch_outputs,
                          tf.expand_dims(tf.to_float(membership), axis=1))
    # masking the outputs of the last step
    return outputs

  def DAGgraph_step(self, batch_inputs, W_list, b_list):
+1 −1
Original line number Diff line number Diff line
"""
Script that trains graph-conv models on Tox21 dataset.
Script that trains DAG models on delaney dataset.
"""
from __future__ import print_function
from __future__ import division
+1 −1
Original line number Diff line number Diff line
"""
Script that trains graph-conv models on Tox21 dataset.
Script that trains DAG models on Tox21 dataset.
"""
from __future__ import print_function
from __future__ import division