Commit 7cafea66, authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #495 from miaecle/DAG2

Directed acyclic graph models
parents d82fd707 5544a2e8
Loading
Loading
Loading
Loading
+5 −6
Original line number Diff line number Diff line
@@ -499,9 +499,9 @@ class DiskDataset(Dataset):
      w_next = np.zeros((0,) + (len(tasks),))
      ids_next = np.zeros((0,), dtype=object)
      for (X, y, w, ids) in self.itershards():
        X_next = np.vstack([X_next, X])
        y_next = np.vstack([y_next, y])
        w_next = np.vstack([w_next, w])
        X_next = np.concatenate([X_next, X], axis=0)
        y_next = np.concatenate([y_next, y], axis=0)
        w_next = np.concatenate([w_next, w], axis=0)
        ids_next = np.concatenate([ids_next, ids])
        while len(X_next) > shard_size:
          X_batch, X_next = X_next[:shard_size], X_next[shard_size:]
@@ -526,9 +526,8 @@ class DiskDataset(Dataset):
    if not len(self.metadata_df):
      raise ValueError("No data in dataset.")
    sample_X = load_from_disk(
        os.path.join(self.data_dir, next(self.metadata_df.iterrows())[1]['X']))[
            0]
    return np.shape(sample_X)
        os.path.join(self.data_dir, next(self.metadata_df.iterrows())[1]['X']))
    return np.shape(sample_X)[1:]

  def get_shard_size(self):
    """Gets size of shards on disk."""
+46 −0
Original line number Diff line number Diff line
@@ -706,6 +706,52 @@ class TestOverfit(test_util.TensorFlowTestCase):

    assert scores[regression_metric.name] > .9

  def test_DAG_singletask_regression_overfit(self):
    """Check that the DAG regressor can overfit a tiny singletask dataset."""
    # Fixed seeds keep the overfit check deterministic across runs.
    np.random.seed(123)
    tf.set_random_seed(123)
    n_tasks = 1

    # Featurize the mini log-solubility dataset with graph convolutions.
    featurizer = dc.feat.ConvMolFeaturizer()
    tasks = ["outcome"]
    csv_path = os.path.join(self.current_dir, "example_regression.csv")
    csv_loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = csv_loader.featurize(csv_path)

    regression_metric = dc.metrics.Metric(
        dc.metrics.pearson_r2_score, task_averager=np.mean)

    n_feat = 75
    batch_size = 10
    # Convert the ConvMol features into the DAG representation.
    dataset = dc.trans.DAGTransformer(max_atoms=50).transform(dataset)

    graph = dc.nn.SequentialDAGGraph(n_feat, batch_size=batch_size,
                                     max_atoms=50)
    graph.add(dc.nn.DAGLayer(30, n_feat, max_atoms=50))
    graph.add(dc.nn.DAGGather(max_atoms=50))

    model = dc.models.MultitaskGraphRegressor(
        graph,
        n_tasks,
        n_feat,
        batch_size=batch_size,
        learning_rate=0.005,
        learning_rate_decay_time=1000,
        optimizer_type="adam",
        beta1=.9,
        beta2=.999)

    # Train long enough to memorize the tiny dataset, then score on train.
    model.fit(dataset, nb_epoch=50)
    model.save()
    scores = model.evaluate(dataset, [regression_metric])

    assert scores[regression_metric.name] > .8

  def test_siamese_singletask_classification_overfit(self):
    """Test siamese singletask model overfits tiny data."""
    np.random.seed(123)
+2 −0
Original line number Diff line number Diff line
@@ -13,6 +13,8 @@ class DTNNGraphRegressor(MultitaskGraphRegressor):

    feat = self.model.return_outputs()
    feat_size = self.feat_dim
    # dimension of `feat` becomes Unknown after tf.tensordot operation
    # need to define dimension of W and b explicitly
    outputs = []
    W_list = []
    b_list = []
+34 −1
Original line number Diff line number Diff line
@@ -11,7 +11,7 @@ __license__ = "MIT"

import tensorflow as tf
from deepchem.nn.layers import GraphGather
from deepchem.models.tf_new_models.graph_topology import GraphTopology, DTNNGraphTopology
from deepchem.models.tf_new_models.graph_topology import GraphTopology, DTNNGraphTopology, DAGGraphTopology


class SequentialGraph(object):
@@ -129,6 +129,39 @@ class SequentialDTNNGraph(SequentialGraph):
      self.layers.append(layer)


class SequentialDAGGraph(SequentialGraph):
  """Sequential container for DAG-style graph models.

  Wires a DAGGraphTopology into its own tf.Graph and stacks layers on
  top of the atom-feature placeholder, mirroring the Keras Sequential API.
  """

  def __init__(self, n_feat, batch_size=50, max_atoms=50):
    """
    Parameters
    ----------
    n_feat: int
      Number of features per atom.
    batch_size: int, optional(default=50)
      Number of molecules in a batch.
    max_atoms: int, optional(default=50)
      Maximum number of atoms in a molecule; choose based on the dataset.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
      self.graph_topology = DAGGraphTopology(
          n_feat, batch_size, max_atoms=max_atoms)
      self.output = self.graph_topology.get_atom_features_placeholder()
    self.layers = []

  def add(self, layer):
    """Appends a layer, feeding it the running output tensor."""
    with self.graph.as_default():
      # DAGLayer additionally consumes the topology placeholders.
      if type(layer).__name__ == 'DAGLayer':
        layer_inputs = [self.output]
        layer_inputs += self.graph_topology.get_topology_placeholders()
        self.output = layer(layer_inputs)
      else:
        self.output = layer(self.output)
      self.layers.append(layer)


class SequentialSupportGraph(object):
  """An analog of Keras Sequential model for test/support models."""

+134 −0
Original line number Diff line number Diff line
@@ -258,3 +258,137 @@ class DTNNGraphTopology(GraphTopology):
    steps = np.array([distance_min + i * step_size for i in range(n_distance)])
    distance_vector = np.exp(-np.square(distance - steps) / (2 * step_size**2))
    return distance_vector


class DAGGraphTopology(GraphTopology):
  """GraphTopology for DAG models.

  Holds the placeholders consumed by DAG layers: per-atom features, the
  parent matrices of each atom's DAG, the order in which atoms must be
  evaluated, and a membership mask marking real (vs. padded) atoms.
  Every molecule is padded to `max_atoms` atoms so all placeholders have
  static shapes of `batch_size * max_atoms` rows.
  """

  def __init__(self, n_feat, batch_size, name='topology', max_atoms=50):
    """
    Parameters
    ----------
    n_feat: int
      Number of features per atom.
    batch_size: int
      Number of molecules in a batch.
    name: str, optional(default='topology')
      Prefix used for all placeholder names.
    max_atoms: int, optional(default=50)
      Maximum number of atoms in a molecule, should be defined based on
      the dataset.
    """
    self.n_feat = n_feat
    self.name = name
    self.max_atoms = max_atoms
    self.batch_size = batch_size
    self.atom_features_placeholder = tf.placeholder(
        dtype='float32',
        shape=(self.batch_size * self.max_atoms, self.n_feat),
        name=self.name + '_atom_features')

    self.parents_placeholder = tf.placeholder(
        dtype='int32',
        shape=(self.batch_size * self.max_atoms, self.max_atoms,
               self.max_atoms),
        # molecule * atom(graph) => step => features
        name=self.name + '_parents')

    self.calculation_orders_placeholder = tf.placeholder(
        dtype='int32',
        shape=(self.batch_size * self.max_atoms, self.max_atoms),
        # molecule * atom(graph) => step
        name=self.name + '_orders')

    self.membership_placeholder = tf.placeholder(
        dtype='int32',
        shape=(self.batch_size * self.max_atoms),
        name=self.name + '_membership')

    # Define the list of tensors to be used as topology
    self.topology = [
        self.parents_placeholder, self.calculation_orders_placeholder,
        self.membership_placeholder
    ]

    self.inputs = [self.atom_features_placeholder]
    self.inputs += self.topology

  def get_parents_placeholder(self):
    """Returns the int32 placeholder holding the DAG parent matrices."""
    return self.parents_placeholder

  def get_calculation_orders_placeholder(self):
    """Returns the int32 placeholder holding atom evaluation orders."""
    return self.calculation_orders_placeholder

  def batch_to_feed_dict(self, batch):
    """Converts the current batch of mol_graphs into tensorflow feed_dict.

    Assigns the graph information in array of ConvMol objects to the
    placeholders tensors for DAG models.

    params
    ------
    batch : np.ndarray
      Array of ConvMol objects

    returns
    -------
    feed_dict : dict
      Can be merged with other feed_dicts for input into tensorflow
    """
    atoms_per_mol = [mol.get_num_atoms() for mol in batch]
    n_atom_features = batch[0].get_atom_features().shape[1]
    # 1 marks a real atom, 0 marks a padding slot, per molecule.
    # NOTE(review): assumes every molecule has at most max_atoms atoms;
    # the repeat count goes negative otherwise — confirm upstream filtering.
    membership = np.concatenate(
        [
            np.array([1] * n_atoms + [0] * (self.max_atoms - n_atoms))
            for n_atoms in atoms_per_mol
        ],
        axis=0)

    atoms_all = []
    # calculation orders for a batch of molecules
    parents_all = []
    calculation_orders = []
    for idm, mol in enumerate(batch):
      # padding atom features vector of each molecule with 0
      atom_features_padded = np.concatenate(
          [
              mol.get_atom_features(), np.zeros(
                  (self.max_atoms - atoms_per_mol[idm], n_atom_features))
          ],
          axis=0)
      atoms_all.append(atom_features_padded)

      # calculation orders for DAGs
      parents = mol.parents
      # number of DAGs should equal number of atoms
      assert len(parents) == atoms_per_mol[idm]
      parents_all.extend(parents[:])
      # padding with `max_atoms`
      parents_all.extend([
          self.max_atoms * np.ones((self.max_atoms, self.max_atoms), dtype=int)
          for i in range(self.max_atoms - atoms_per_mol[idm])
      ])
      for parent in parents:
        # index for an atom in `parents_all` and `atoms_all` is different,
        # this function changes the index from the position in current
        # molecule (DAGs, `parents_all`) to the position in the batch of
        # molecules (`atoms_all`); only used in tf.gather on
        # `atom_features_placeholder`
        calculation_orders.append(self.index_changing(parent[:, 0], idm))

      # padding with `batch_size*max_atoms`
      calculation_orders.extend([
          self.batch_size * self.max_atoms * np.ones(
              (self.max_atoms,), dtype=int)
          for i in range(self.max_atoms - atoms_per_mol[idm])
      ])

    atoms_all = np.concatenate(atoms_all, axis=0)
    parents_all = np.stack(parents_all, axis=0)
    calculation_orders = np.stack(calculation_orders, axis=0)
    atoms_dict = {
        self.atom_features_placeholder: atoms_all,
        self.membership_placeholder: membership,
        self.parents_placeholder: parents_all,
        self.calculation_orders_placeholder: calculation_orders
    }

    return atoms_dict

  def index_changing(self, index, n_mol):
    """Map within-molecule atom indices to batch-wide row indices.

    Entries referring to real atoms (< max_atoms) are offset by the
    molecule's position in the batch; padding entries (>= max_atoms) are
    redirected to the shared padding row at `batch_size * max_atoms`.

    params
    ------
    index : np.ndarray
      Integer atom indices local to molecule `n_mol`.
    n_mol : int
      Position of the molecule within the batch.

    returns
    -------
    np.ndarray
      Indices into the flattened `atom_features_placeholder`.
    """
    index = np.asarray(index)
    # Vectorized replacement of the original per-element loop.
    output = np.where(index < self.max_atoms,
                      index + n_mol * self.max_atoms,
                      self.batch_size * self.max_atoms)
    # Preserve the caller-supplied integer dtype, as np.zeros_like did.
    return output.astype(index.dtype, copy=False)
Loading