Commit d69e919c authored by miaecle's avatar miaecle
Browse files

weave model first draft

parent 8731ebf8
Loading
Loading
Loading
Loading
+6 −1
Original line number Diff line number Diff line
@@ -192,13 +192,15 @@ def pair_features(mol, edge_list, canon_adj_list, bt_len=6):
  rings = mol.GetRingInfo().AtomRings()
  for a1 in range(num_atoms):
    for a2 in canon_adj_list[a1]:
      # first `bt_len` features are bond features(if applicable)
      features[a1, a2, :bt_len] = np.asarray(
          edge_list[tuple(sorted((a1, a2)))], dtype=float)
    for ring in rings:
      if a1 in ring:
        # `bt_len`-th feature is if the pair of atoms are in the same ring
        features[a1, ring, bt_len] = 1
        features[a1, a1, bt_len] = 0.
    # find graph distance between two atoms
    # graph distance between two atoms
    distance = find_distance(
        a1, num_atoms, canon_adj_list, max_distance=max_distance)
    features[a1, :, bt_len + 1:] = distance
@@ -209,11 +211,14 @@ def pair_features(mol, edge_list, canon_adj_list, bt_len=6):
def find_distance(a1, num_atoms, canon_adj_list, max_distance=7):
  distance = np.zeros((num_atoms, max_distance))
  radial = 0
  # atoms `radial` bonds away from `a1`
  adj_list = set(canon_adj_list[a1])
  # atoms less than `radial` bonds away
  all_list = set([a1])
  while radial < max_distance:
    distance[list(adj_list), radial] = 1
    all_list.update(adj_list)
    # find atoms `radial`+1 bonds away
    next_adj = set()
    for adj in adj_list:
      next_adj.update(canon_adj_list[adj])
+99 −0
Original line number Diff line number Diff line
@@ -752,6 +752,105 @@ class TestOverfit(test_util.TensorFlowTestCase):

    assert scores[regression_metric.name] > .8

  def test_weave_singletask_classification_overfit(self):
    """Test that a weave classifier can overfit a tiny classification set.

    Featurizes `example_classification.csv` with `WeaveFeaturizer`, trains a
    single-task `MultitaskGraphClassifier` on a graph made of `WeaveLayer`,
    `WeaveConcat`, `BatchNormalization` and `WeaveGather` for 20 epochs, and
    asserts that accuracy on the training data exceeds 0.65.
    """
    # Seed both numpy and tensorflow so the run is repeatable.
    np.random.seed(123)
    tf.set_random_seed(123)
    n_tasks = 1

    # Load the mini classification dataset.
    featurizer = dc.feat.WeaveFeaturizer()
    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir, "example_classification.csv")
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(input_file)

    classification_metric = dc.metrics.Metric(dc.metrics.accuracy_score)

    n_atom_feat = 75
    n_pair_feat = 14
    n_feat = 128
    batch_size = 10
    max_atoms = 50

    graph = dc.nn.SequentialWeaveGraph(
        max_atoms=max_atoms, n_atom_feat=n_atom_feat, n_pair_feat=n_pair_feat)
    # Reuse the named feature sizes so the layer cannot drift out of sync
    # with the graph's input placeholders.
    graph.add(dc.nn.WeaveLayer(max_atoms, n_atom_feat, n_pair_feat))
    graph.add(dc.nn.WeaveConcat(batch_size, n_output=n_feat))
    graph.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
    graph.add(dc.nn.WeaveGather(batch_size, n_input=n_feat))

    model = dc.models.MultitaskGraphClassifier(
        graph,
        n_tasks,
        n_feat,
        batch_size=batch_size,
        learning_rate=1e-3,
        learning_rate_decay_time=1000,
        optimizer_type="adam",
        beta1=.9,
        beta2=.999)

    # Fit the model and save it.
    model.fit(dataset, nb_epoch=20)
    model.save()

    # Evaluate on the training data itself: this is an overfitting check.
    scores = model.evaluate(dataset, [classification_metric])

    assert scores[classification_metric.name] > .65

  def test_weave_singletask_regression_overfit(self):
    """Test that a weave regressor can overfit a tiny regression set.

    Featurizes `example_regression.csv` with `WeaveFeaturizer`, trains a
    single-task `MultitaskGraphRegressor` on a graph made of `WeaveLayer`,
    `WeaveConcat`, `BatchNormalization` and `WeaveGather` for 40 epochs, and
    asserts that the Pearson R^2 score on the training data exceeds 0.9.
    """
    # Seed both numpy and tensorflow so the run is repeatable.
    np.random.seed(123)
    tf.set_random_seed(123)
    n_tasks = 1

    # Load the mini regression dataset.
    featurizer = dc.feat.WeaveFeaturizer()
    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir, "example_regression.csv")
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(input_file)

    regression_metric = dc.metrics.Metric(
        dc.metrics.pearson_r2_score, task_averager=np.mean)

    n_atom_feat = 75
    n_pair_feat = 14
    n_feat = 128
    batch_size = 10
    max_atoms = 50

    graph = dc.nn.SequentialWeaveGraph(
        max_atoms=max_atoms, n_atom_feat=n_atom_feat, n_pair_feat=n_pair_feat)
    # Reuse the named feature sizes so the layer cannot drift out of sync
    # with the graph's input placeholders.
    graph.add(dc.nn.WeaveLayer(max_atoms, n_atom_feat, n_pair_feat))
    graph.add(dc.nn.WeaveConcat(batch_size, n_output=n_feat))
    graph.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
    graph.add(dc.nn.WeaveGather(batch_size, n_input=n_feat))

    model = dc.models.MultitaskGraphRegressor(
        graph,
        n_tasks,
        n_feat,
        batch_size=batch_size,
        learning_rate=1e-3,
        learning_rate_decay_time=1000,
        optimizer_type="adam",
        beta1=.9,
        beta2=.999)

    # Fit the model and save it.
    model.fit(dataset, nb_epoch=40)
    model.save()

    # Evaluate on the training data itself: this is an overfitting check.
    scores = model.evaluate(dataset, [regression_metric])

    assert scores[regression_metric.name] > .9

  def test_siamese_singletask_classification_overfit(self):
    """Test siamese singletask model overfits tiny data."""
    np.random.seed(123)
+2 −0
Original line number Diff line number Diff line
@@ -32,6 +32,8 @@ def load_tox21(featurizer='ECFP', split='index', K=4):
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()

+40 −20
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ from deepchem.nn import initializations
from deepchem.nn import model_ops
from deepchem.nn.copy import Layer


class WeaveLayer(Layer):
  """" Main layer of Weave model
  For each molecule, atom features and pair features are recombined to 
@@ -24,6 +25,7 @@ class WeaveLayer(Layer):
  """

  def __init__(self,
               max_atoms,
               n_atom_input_feat=75,
               n_pair_input_feat=14,
               n_atom_output_feat=50,
@@ -58,7 +60,7 @@ class WeaveLayer(Layer):

    """
    super(WeaveLayer, self).__init__(**kwargs)

    self.max_atoms = max_atoms
    self.init = initializations.get(init)  # Set weight initialization
    self.activation = activations.get(activation)  # Get activations
    self.n_hidden_AA = n_hidden_AA
@@ -107,9 +109,10 @@ class WeaveLayer(Layer):
        self.n_pair_output_feat,
    ])

    self.trainable_weights = [self.W_AA, self.b_AA, self.W_PA, self.b_PA,
        self.W_A, self.b_A, self.W_AP, self.b_AP, self.W_PP, self.b_PP,
        self.W_P, self.b_P]
    self.trainable_weights = [
        self.W_AA, self.b_AA, self.W_PA, self.b_PA, self.W_A, self.b_A,
        self.W_AP, self.b_AP, self.W_PP, self.b_PP, self.W_P, self.b_P
    ]

  def call(self, x, mask=None):
    """Execute this layer on input tensors.
@@ -138,7 +141,7 @@ class WeaveLayer(Layer):

    atom_mask = x[2]
    pair_mask = x[3]
    max_atoms = atom_features.get_shape().as_list()[1]
    max_atoms = self.max_atoms

    AA = tf.tensordot(atom_features, self.W_AA, [[2], [0]]) + self.b_AA
    AA = self.activation(AA)
@@ -164,6 +167,7 @@ class WeaveLayer(Layer):
    P = tf.multiply(P, tf.expand_dims(pair_mask, axis=3))
    return A, P


class WeaveConcat(Layer):
  """" Concat a batch of molecules into a batch of atoms
  """
@@ -202,7 +206,9 @@ class WeaveConcat(Layer):
    """

    self.W = self.init([self.n_atom_input_feat, self.n_output])
    self.b = model_ops.zeros(shape=[self.n_output,])
    self.b = model_ops.zeros(shape=[
        self.n_output,
    ])

    self.trainable_weights = self.W + self.b

@@ -227,12 +233,15 @@ class WeaveConcat(Layer):
    atom_features = x[0]
    atom_masks = x[1]
    A = tf.split(atom_features, self.batch_size, axis=0)
    A_mask = tf.split(tf.cast(atom_masks, dtype=tf.bool), self.batch_size, axis=0)
    outputs = tf.concat([tf.boolean_mask(A[i], A_mask[i]) for i in range(len(A))], axis=0)
    A_mask = tf.split(
        tf.cast(atom_masks, dtype=tf.bool), self.batch_size, axis=0)
    outputs = tf.concat(
        [tf.boolean_mask(A[i], A_mask[i]) for i in range(len(A))], axis=0)
    outputs = tf.matmul(outputs, self.W) + self.b
    outputs = self.activation(outputs)
    return outputs


class WeaveGather(Layer):
  """" Gather layer of Weave model
  a batch of normalized atom features go through a hidden layer, 
@@ -241,7 +250,8 @@ class WeaveGather(Layer):

  def __init__(self,
               batch_size,
               gaussian_expand=True,
               n_input=128,
               gaussian_expand=False,
               epsilon=1e-3,
               momentum=0.99,
               **kwargs):
@@ -254,7 +264,7 @@ class WeaveGather(Layer):
      Whether to expand each dimension of atomic features by gaussian histogram

    """

    self.n_input = n_input
    self.batch_size = batch_size
    self.gaussian_expand = gaussian_expand
    self.epsilon = epsilon
@@ -296,5 +306,15 @@ class WeaveGather(Layer):
    output_molecules = tf.stack(output_molecules)
    return output_molecules

  def gaussian_histogram(x):
    return x
  def gaussian_histogram(self, x):
    """Expand `x` by evaluating it under a fixed set of Gaussian densities.

    Each (mean, standard deviation) pair in `gaussian_memberships` defines one
    Normal distribution. The pdf of every distribution is evaluated at `x`
    and the results are concatenated along axis 1.

    Parameters
    ----------
    x: tensor
      Values to expand.
      NOTE(review): presumably a 2-D tensor, since the results are joined
      along axis 1 -- confirm against the caller.

    Returns
    -------
    tensor
      The pdf values of `x` under each distribution, concatenated along
      axis 1 in the order the memberships are listed.
    """
    # (mean, standard deviation) of each Gaussian membership function.
    gaussian_memberships = [(-1.645, 0.080), (-1.080, 0.029), (-0.739, 0.018),
                            (-0.468, 0.014), (-0.228, 0.013), (0., 0.013),
                            (0.228, 0.013), (0.468, 0.014), (0.739, 0.018),
                            (1.080, 0.029), (1.645, 0.080)]
    dist = [
        tf.contrib.distributions.Normal(mu=mean, sigma=std)
        for mean, std in gaussian_memberships
    ]
    # Iterate the distributions directly instead of a hard-coded range(11),
    # so the membership list can change length without breaking this loop.
    outputs = [d.pdf(x) for d in dist]
    return tf.concat(outputs, axis=1)
+10 −12
Original line number Diff line number Diff line
@@ -24,35 +24,33 @@ max_atoms_valid = max([mol.get_num_atoms() for mol in valid_dataset.X])
max_atoms_test = max([mol.get_num_atoms() for mol in test_dataset.X])
max_atoms = max([max_atoms_train, max_atoms_valid, max_atoms_test])


n_atom_feat = 75
n_pair_feat = 14
max_atoms = 55
# Batch size of models
batch_size = 64
n_output = 128
graph = dc.nn.SequentialWeaveGraph(max_atoms=max_atoms,
                                   n_atom_feat=n_atom_feat,
                                   n_pair_feat=n_pair_feat)
n_feat = 128
graph = dc.nn.SequentialWeaveGraph(
    max_atoms=max_atoms, n_atom_feat=n_atom_feat, n_pair_feat=n_pair_feat)

graph.add(dc.nn.WeaveLayer())
graph.add(dc.nn.WeaveConcat(batch_size))
graph.add(dc.nn.WeaveLayer(max_atoms, 75, 14))
#graph.add(dc.nn.WeaveLayer(max_atoms, 50, 50))
graph.add(dc.nn.WeaveConcat(batch_size, n_output=n_feat))
graph.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
graph.add(dc.nn.WeaveGather(batch_size, gaussian_expand=False))
graph.add(dc.nn.WeaveGather(batch_size, n_input=n_feat, gaussian_expand=False))

model = dc.models.MultitaskGraphRegressor(
    graph,
    len(delaney_tasks),
    128,
    n_feat,
    batch_size=batch_size,
    learning_rate=1e-4,
    learning_rate=1e-3,
    learning_rate_decay_time=1000,
    optimizer_type="adam",
    beta1=.9,
    beta2=.999)

# Fit trained model
model.fit(train_dataset, nb_epoch=40, log_every_N_batches=50)
model.fit(train_dataset, nb_epoch=50, log_every_N_batches=50)
print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
Loading