Commit 5c5a918c authored by miaecle's avatar miaecle
Browse files

DAGTransformer rebuild

parent 12487c87
Loading
Loading
Loading
Loading
+5 −6
Original line number Diff line number Diff line
@@ -499,9 +499,9 @@ class DiskDataset(Dataset):
      w_next = np.zeros((0,) + (len(tasks),))
      ids_next = np.zeros((0,), dtype=object)
      for (X, y, w, ids) in self.itershards():
        X_next = np.vstack([X_next, X])
        y_next = np.vstack([y_next, y])
        w_next = np.vstack([w_next, w])
        X_next = np.concatenate([X_next, X], axis=0)
        y_next = np.concatenate([y_next, y], axis=0)
        w_next = np.concatenate([w_next, w], axis=0)
        ids_next = np.concatenate([ids_next, ids])
        while len(X_next) > shard_size:
          X_batch, X_next = X_next[:shard_size], X_next[shard_size:]
@@ -526,9 +526,8 @@ class DiskDataset(Dataset):
    if not len(self.metadata_df):
      raise ValueError("No data in dataset.")
    sample_X = load_from_disk(
        os.path.join(self.data_dir, next(self.metadata_df.iterrows())[1]['X']))[
            0]
    return np.shape(sample_X)
        os.path.join(self.data_dir, next(self.metadata_df.iterrows())[1]['X']))
    return np.shape(sample_X)[1:]

  def get_shard_size(self):
    """Gets size of shards on disk."""
+5 −92
Original line number Diff line number Diff line
@@ -349,7 +349,7 @@ class DAGGraphTopology(GraphTopology):
      atoms_all.append(atom_features_padded)

      # calculation orders for DAGs
      parents = self.UG_to_DAG(mol)
      parents = mol.parents
      # number of DAGs should equal number of atoms
      assert len(parents) == atoms_per_mol[idm]
      parents_all.extend(parents[:])
@@ -392,90 +392,3 @@ class DAGGraphTopology(GraphTopology):
      else:
        output[ide] = self.batch_size * self.max_atoms
    return output

  def UG_to_DAG(self, sample):
    """Generates the DAG calculation orders for a molecule.

    One DAG is built per atom by a breadth-first expansion over the
    adjacency list, starting from that atom.

    `sample` must provide `get_adjacency_list()` (for each atom, the
    indices of its neighbours) and `get_num_atoms()`.

    Returns a list with one numpy array per atom, each of shape
    (`self.max_atoms`, `self.max_atoms`). Every non-padding row starts
    with an atom index followed by all atoms reached through that atom in
    the expansion; unused slots and leading padding rows are filled with
    `self.max_atoms`.

    Raises ValueError if the molecule has more atoms than `self.max_atoms`,
    since the result could not be padded to a square matrix.
    """
    # starting from the adjacency list derived by graphconv featurizer
    UG = sample.get_adjacency_list()
    # number of atoms, also number of DAGs
    n_atoms = sample.get_num_atoms()
    # `max_atoms` is the maximum number of calculation steps and also the
    # value used for padding
    max_atoms = self.max_atoms
    if n_atoms > max_atoms:
      # without this check the rows below would end up ragged or the
      # matrix would have more than `max_atoms` rows
      raise ValueError("Molecule has %d atoms, which exceeds max_atoms=%d." %
                       (n_atoms, max_atoms))

    # list of calculation orders, one per starting atom
    parents = []
    for count in range(n_atoms):
      # each iteration generates the DAG starting from atom `count`
      DAG = []
      # flags of whether the atom is already included in the DAG
      visited = [False] * n_atoms
      visited[count] = True
      current_atoms = [count]
      # the first pass adds atoms directly bonded to `count`, the second
      # pass atoms two bonds away, and so on. An empty frontier means that
      # everything reachable has been added; atoms in a separate fragment
      # of the molecule are never reached and simply keep no parents.
      while current_atoms:
        next_atoms = []
        for current_atom in current_atoms:
          for atom_adj in UG[current_atom]:
            if not visited[atom_adj]:
              # record the dependency and expand from `atom_adj` next
              DAG.append((current_atom, atom_adj))
              visited[atom_adj] = True
              next_atoms.append(atom_adj)
        current_atoms = next_atoms

      # list of lists, elements represent the calculation orders
      # for atoms in the current graph
      parent = [[] for _ in range(n_atoms)]
      # DAG starts from the target atom, calculation should go in reverse,
      # so `parent[far]` is complete before it is copied into `parent[near]`
      for near, far in reversed(DAG):
        # `far` is the parent of `near`
        parent[near].append(far)
        # all the parents of `far` are also parents of `near`
        parent[near].extend(parent[far])

      # put each atom's own index in front of its parents list
      for index, row in enumerate(parent):
        row.insert(0, index)

      # atoms with fewer parents (farther from the target atom) come first,
      # so their graph features are available when atoms with more parents
      # are calculated; the target atom of this DAG comes last.
      # `sorted` is stable, so ties keep atom-index order.
      parent = sorted(parent, key=len)

      # pad every row to length `max_atoms` with the value `max_atoms`
      for row in parent:
        row.extend([max_atoms] * (max_atoms - len(row)))

      # pad with full rows in front so the matrix is max_atoms * max_atoms
      padding = [[max_atoms] * max_atoms for _ in range(max_atoms - n_atoms)]
      parents.append(np.array(padding + parent))

    return parents
+1 −0
Original line number Diff line number Diff line
@@ -15,3 +15,4 @@ from deepchem.trans.transformers import CDFTransformer
from deepchem.trans.transformers import PowerTransformer
from deepchem.trans.transformers import CoulombFitTransformer
from deepchem.trans.transformers import IRVTransformer
from deepchem.trans.transformers import DAGTransformer
 No newline at end of file
+120 −0
Original line number Diff line number Diff line
@@ -775,3 +775,123 @@ class IRVTransformer():
  def untransform(self, z):
    raise NotImplementedError(
        "Cannot untransform datasets with IRVTransformer.")


class DAGTransformer(Transformer):
  """Performs transform from ConvMol adjacency lists to
  DAG calculation orders.

  For every molecule in X the calculation orders built by `UG_to_DAG`
  are stored on the molecule as its `parents` attribute.
  """

  def __init__(self,
               max_atoms=50,
               transform_X=True,
               transform_y=False,
               transform_w=False):
    """Initializes DAGTransformer.

    Only X can be transformed.

    `max_atoms` is the size every calculation-order matrix is padded to
    and also the value used for padding.

    Raises ValueError if `transform_X` is false or if `transform_y` or
    `transform_w` is true.
    """
    # explicit raise instead of `assert`, which is stripped under `python -O`
    if not transform_X or transform_y or transform_w:
      raise ValueError("DAGTransformer can only transform X.")
    self.max_atoms = max_atoms
    self.transform_X = transform_X
    self.transform_y = transform_y
    self.transform_w = transform_w

  def transform_array(self, X, y, w):
    """Add calculation orders to ConvMol objects.

    Each molecule in X is modified in place (a `parents` attribute is
    set); y and w are returned untouched.
    """
    if self.transform_X:
      for mol in X:
        mol.parents = self.UG_to_DAG(mol)
    return (X, y, w)

  def untransform(self, z):
    """Not supported; always raises NotImplementedError."""
    raise NotImplementedError(
        "Cannot untransform datasets with DAGTransformer.")

  def UG_to_DAG(self, sample):
    """Generates the DAG calculation orders for a molecule.

    One DAG is built per atom by a breadth-first expansion over the
    adjacency list, starting from that atom.

    `sample` must provide `get_adjacency_list()` (for each atom, the
    indices of its neighbours) and `get_num_atoms()`.

    Returns a list with one numpy array per atom, each of shape
    (`self.max_atoms`, `self.max_atoms`). Every non-padding row starts
    with an atom index followed by all atoms reached through that atom in
    the expansion; unused slots and leading padding rows are filled with
    `self.max_atoms`.

    Raises ValueError if the molecule has more atoms than `self.max_atoms`,
    since the result could not be padded to a square matrix.
    """
    # starting from the adjacency list derived by graphconv featurizer
    UG = sample.get_adjacency_list()
    # number of atoms, also number of DAGs
    n_atoms = sample.get_num_atoms()
    # `max_atoms` is the maximum number of calculation steps and also the
    # value used for padding
    max_atoms = self.max_atoms
    if n_atoms > max_atoms:
      # without this check the rows below would end up ragged or the
      # matrix would have more than `max_atoms` rows
      raise ValueError("Molecule has %d atoms, which exceeds max_atoms=%d." %
                       (n_atoms, max_atoms))

    # list of calculation orders, one per starting atom
    parents = []
    for count in range(n_atoms):
      # each iteration generates the DAG starting from atom `count`
      DAG = []
      # flags of whether the atom is already included in the DAG
      visited = [False] * n_atoms
      visited[count] = True
      current_atoms = [count]
      # the first pass adds atoms directly bonded to `count`, the second
      # pass atoms two bonds away, and so on. An empty frontier means that
      # everything reachable has been added; atoms in a separate fragment
      # of the molecule are never reached and simply keep no parents.
      while current_atoms:
        next_atoms = []
        for current_atom in current_atoms:
          for atom_adj in UG[current_atom]:
            if not visited[atom_adj]:
              # record the dependency and expand from `atom_adj` next
              DAG.append((current_atom, atom_adj))
              visited[atom_adj] = True
              next_atoms.append(atom_adj)
        current_atoms = next_atoms

      # list of lists, elements represent the calculation orders
      # for atoms in the current graph
      parent = [[] for _ in range(n_atoms)]
      # DAG starts from the target atom, calculation should go in reverse,
      # so `parent[far]` is complete before it is copied into `parent[near]`
      for near, far in reversed(DAG):
        # `far` is the parent of `near`
        parent[near].append(far)
        # all the parents of `far` are also parents of `near`
        parent[near].extend(parent[far])

      # put each atom's own index in front of its parents list
      for index, row in enumerate(parent):
        row.insert(0, index)

      # atoms with fewer parents (farther from the target atom) come first,
      # so their graph features are available when atoms with more parents
      # are calculated; the target atom of this DAG comes last.
      # `sorted` is stable, so ties keep atom-index order.
      parent = sorted(parent, key=len)

      # pad every row to length `max_atoms` with the value `max_atoms`
      for row in parent:
        row.extend([max_atoms] * (max_atoms - len(row)))

      # pad with full rows in front so the matrix is max_atoms * max_atoms
      padding = [[max_atoms] * max_atoms for _ in range(max_atoms - n_atoms)]
      parents.append(np.array(padding + parent))

    return parents
+16 −4
Original line number Diff line number Diff line
@@ -19,14 +19,26 @@ train_dataset, valid_dataset, test_dataset = delaney_datasets
# Fit models
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

# Largest molecule (by atom count) in each split; the overall maximum is the
# size passed to the DAG transformer and to the DAG layers of the model.
max_atoms_train = max([mol.get_num_atoms() for mol in train_dataset.X])
max_atoms_valid = max([mol.get_num_atoms() for mol in valid_dataset.X])
max_atoms_test = max([mol.get_num_atoms() for mol in test_dataset.X])
max_atoms = max([max_atoms_train, max_atoms_valid, max_atoms_test])

# Apply the DAG transformer to every split, so each molecule carries the
# calculation orders the DAG model reads.
transformer = dc.trans.DAGTransformer(max_atoms=max_atoms)
# NOTE(review): each split is resharded with 512 before transforming,
# presumably the shard size in samples -- confirm against
# DiskDataset.reshard.
train_dataset.reshard(512)
train_dataset = transformer.transform(train_dataset)
valid_dataset.reshard(512)
valid_dataset = transformer.transform(valid_dataset)
test_dataset.reshard(512)
test_dataset = transformer.transform(test_dataset)
# Do setup required for tf/keras models
# Number of features on conv-mols
n_feat = 75
# Batch size of models
batch_size = 32
graph = dc.nn.SequentialDAGGraph(75, batch_size=batch_size, max_atoms=55)
graph.add(dc.nn.DAGLayer(30, 75, max_atoms=55))
graph.add(dc.nn.DAGGather(max_atoms=55))
batch_size = 64
graph = dc.nn.SequentialDAGGraph(75, batch_size=batch_size, max_atoms=max_atoms)
graph.add(dc.nn.DAGLayer(30, 75, max_atoms=max_atoms))
graph.add(dc.nn.DAGGather(max_atoms=max_atoms))

model = dc.models.MultitaskGraphRegressor(
    graph,
Loading