Commit bba48cdf authored by miaecle's avatar miaecle
Browse files

add docs

parent 06ecfb87
Loading
Loading
Loading
Loading
+53 −9
Original line number Diff line number Diff line
@@ -337,6 +337,7 @@ class DAGGraphTopology(GraphTopology):

    atoms_all = []
    parents_all = []
    # calculation orders for a batch of molecules
    calculation_orders = []
    for idm, mol in enumerate(batch):
      atom_features_padded = np.concatenate(
@@ -349,8 +350,9 @@ class DAGGraphTopology(GraphTopology):
      atoms_all.append(atom_features_padded)

      parents = self.UG_to_DAG(mol)
      # ConvMol objects input here should have gone through the DAG Transformer
      # calculation orders for DAGs
      assert len(parents) == atoms_per_mol[idm]
      # number of DAGs should equal number of atoms
      parents_all.extend(parents[:])
      parents_all.extend([
          self.max_atoms * np.ones((self.max_atoms, self.max_atoms), dtype=int)
@@ -359,13 +361,16 @@ class DAGGraphTopology(GraphTopology):
      # padding with max_atoms
      for parent in parents:
        calculation_orders.append(self.indice_changing(parent[:, 0], idm))
        # Convert the atom indices from positions within the current molecule
        # (as used in its DAGs) to positions within the batch of molecules
        # (as used in atom_features_placeholder). The two indexings differ,
        # and the converted indices are only used by tf.gather on
        # atom_features_placeholder.
      calculation_orders.extend([
          self.batch_size * self.max_atoms * np.ones(
              (self.max_atoms,), dtype=int)
          for i in range(self.max_atoms - atoms_per_mol[idm])
      ])
      # padding with (batch_size * max_atoms)

    atoms_all = np.concatenate(atoms_all, axis=0)
    parents_all = np.stack(parents_all, axis=0)
@@ -389,45 +394,84 @@ class DAGGraphTopology(GraphTopology):
    return output

  def UG_to_DAG(self, sample):
    """Generate the DAGs (calculation orders) for one molecule.

    One DAG is built per atom: starting from that atom, the molecule's
    undirected graph is walked outwards bond by bond, and every atom records
    the atoms reached through it as its "parents".

    Parameters
    ----------
    sample: object
      Molecule providing `get_adjacency_list()` and `get_num_atoms()`
      (presumably a ConvMol produced by the graph-conv featurizer -- confirm
      against the caller).

    Returns
    -------
    list of np.ndarray
      One array per atom of the molecule. In array `i` (the DAG stemming from
      atom `i`) each row holds an atom index followed by the indices of all of
      its parents, padded on the right with `self.max_atoms`. Rows are ordered
      by increasing number of parents, and rows made only of padding are placed
      first, so that each array is max_atoms * max_atoms as long as the
      molecule has at most `self.max_atoms` atoms.
    """
    # starting from the adjacency list of the undirected molecular graph
    adjacency = sample.get_adjacency_list()
    # number of DAGs to generate, one per atom
    n_atoms = sample.get_num_atoms()
    # doubles as the padding value and as the padded side length of each DAG
    max_atoms = self.max_atoms

    parents = []
    for root in range(n_atoms):
      # each iteration generates the DAG stemming from atom `root`
      edges = []
      # flags, whether the atom is already included in the DAG
      visited = [False] * n_atoms
      visited[root] = True
      frontier = [root]
      # Atoms directly bonded to `root` are added in the first pass, atoms two
      # bonds away in the second pass, and so on. The walk stops as soon as a
      # pass reaches no new atom, which also covers molecules made of separate
      # parts where not every atom can be reached from `root`.
      while frontier:
        next_frontier = []
        for current_atom in frontier:
          for neighbor in adjacency[current_atom]:
            if not visited[neighbor]:
              visited[neighbor] = True
              # dependency map of this DAG: `neighbor` is reached through
              # `current_atom`
              edges.append((current_atom, neighbor))
              next_frontier.append(neighbor)
        frontier = next_frontier

      # The edges were collected starting from `root`, hence the parents are
      # accumulated in reverse: edge[1] is a parent of edge[0], and all the
      # parents of edge[1] are parents of edge[0] as well.
      ancestors = [[] for _ in range(n_atoms)]
      for atom, reached in reversed(edges):
        ancestors[atom].append(reached)
        ancestors[atom].extend(ancestors[reached])

      # rows[i][0] is the atom itself, rows[i][1:] are all of its parents
      rows = [[atom] + found for atom, found in enumerate(ancestors)]
      # Key part of this function: atoms with fewer parents come first, so
      # atoms without parents are calculated first and atoms with more parents
      # can build on already calculated features. `root` comes last, since
      # every atom reachable from it is its parent.
      rows.sort(key=len)
      # pad every row, then the front of the array, with max_atoms
      rows = [row + [max_atoms] * (max_atoms - len(row)) for row in rows]
      filler = [[max_atoms] * max_atoms for _ in range(max_atoms - len(rows))]
      parents.append(np.array(filler + rows))
    return parents
+7 −6
Original line number Diff line number Diff line
@@ -23,19 +23,20 @@ metric = [

# Batch size of models
batch_size = 50
# Embedding width used by the DTNN layers that take `n_embedding` below
n_embedding = 20
graph_model = dc.nn.SequentialDTNNGraph(max_n_atoms=23, n_distance=100)
# NOTE(review): the four statements using the literal 20 and the four using
# `n_embedding` look like the before/after sides of a diff. As written, every
# layer is added to graph_model twice and n_feat is assigned twice; confirm
# which set is meant to stay.
graph_model.add(dc.nn.DTNNEmbedding(n_embedding=20))
graph_model.add(dc.nn.DTNNStep(n_embedding=20, n_distance=100))
graph_model.add(dc.nn.DTNNStep(n_embedding=20, n_distance=100))
graph_model.add(dc.nn.DTNNGather(n_embedding=20))
n_feat = 20
graph_model.add(dc.nn.DTNNEmbedding(n_embedding=n_embedding))
graph_model.add(dc.nn.DTNNStep(n_embedding=n_embedding, n_distance=100))
graph_model.add(dc.nn.DTNNStep(n_embedding=n_embedding, n_distance=100))
graph_model.add(dc.nn.DTNNGather(n_embedding=n_embedding))
# n_feat is passed on to DTNNGraphRegressor below; it ends up equal to n_embedding
n_feat = n_embedding

model = dc.models.DTNNGraphRegressor(
    graph_model,
    len(tasks),
    n_feat,
    batch_size=batch_size,
    learning_rate=1e-3,
    learning_rate=0.001,
    learning_rate_decay_time=1000,
    optimizer_type="adam",
    beta1=.9,