Commit 90dde2da authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Changes for N2 up to simple fit test

parent d4804caa
Loading
Loading
Loading
Loading
+58 −24
Original line number Diff line number Diff line
@@ -455,7 +455,7 @@ def max_pair_distance_pairs(mol: RDKitMol,
    sum_adj += np.linalg.matrix_power(adj, power)
  nonzero_locs = np.where(sum_adj != 0)
  num_pairs = len(nonzero_locs[0])
  # This creates a mstrix of shape (2, num_pairs)
  # This creates a matrix of shape (2, num_pairs)
  pair_edges = np.reshape(np.array(list(zip(nonzero_locs))), (2, num_pairs))
  return pair_edges

@@ -489,13 +489,14 @@ def pair_features(
    The number of different bond types to consider.
  graph_distance: bool, optional (default True)
    If true, use graph distance between molecules. Else use euclidean
    distance. The specified `mol` must have a conformer. Atomic positions will
    be retrieved by calling `mol.getConformer(0)`.
    distance. The specified `mol` must have a conformer. Atomic
    positions will be retrieved by calling `mol.getConformer(0)`.
  max_pair_distance: Union[int, str], (default 'infinity')
    This value can be a positive integer or the string 'infinity'. This
    parameter determines the maximum graph distance at which pair features
    are computed. For example, if `max_pair_distance==2`, then pair features
    are computed only for atoms at most graph distance 2 apart.
    This value can be a positive integer or the string 'infinity'.
    This parameter determines the maximum graph distance at which pair
    features are computed. For example, if `max_pair_distance==2`,
    then pair features are computed only for atoms at most graph
    distance 2 apart.

  Note
  ----
@@ -504,8 +505,13 @@ def pair_features(
  Returns
  -------
  features: np.ndarray
    Of shape `(N, N, bt_len + max_distance + 1)`. This is the array of pairwise
    features for all atom pairs.
    Of shape `(N_edges, bt_len + max_distance + 1)`. This is the array
    of pairwise features for all atom pairs, where N_edges is the
    number of edges within max_pair_distance of one another in this
    molecules.
  pair_edges: np.ndarray
    Of shape `(2, num_pairs)` where `num_pairs` is the total number of
    pairs within `max_pair_distance` of one another.
  """
  if graph_distance:
    max_distance = 7
@@ -514,27 +520,55 @@ def pair_features(
  N = mol.GetNumAtoms()
  pair_edges = max_pair_distance_pairs(mol, max_pair_distance)
  num_pairs = pair_edges.shape[1]
  # TODO(rbharath): Figure out how to rewrite this to use the pair_edges correctly
  if max_pair_distance == "infinity":
    features = np.zeros((N, N, bt_len + max_distance + 1))
  N_edges = pair_edges.shape[1]
  features = np.zeros((N_edges, bt_len + max_distance + 1))
  #if max_pair_distance == "infinity":
  #  features = np.zeros((N, N, bt_len + max_distance + 1))
  #else:
  # Get mapping
  mapping = {}
  for n in range(N_edges):
    a1, a2 = pair_edges[:, n]
    mapping[(int(a1), int(a2))] = n
  num_atoms = mol.GetNumAtoms()
  rings = mol.GetRingInfo().AtomRings()
  for a1 in range(num_atoms):
    for a2 in bond_adj_list[a1]:
      # first `bt_len` features are bond features(if applicable)
      features[a1, a2, :bt_len] = np.asarray(
      if (int(a1), int(a2)) not in mapping:
        raise ValueError(
            "Malformed molecule with bonds not in specified graph distance.")
      else:
        n = mapping[(int(a1), int(a2))]
      features[n, :bt_len] = np.asarray(
          bond_features_map[tuple(sorted((a1, a2)))], dtype=float)
    for ring in rings:
      if a1 in ring:
        for a2 in ring:
          if (int(a1), int(a2)) not in mapping:
            # For ring pairs outside max pairs distance continue
            continue
          else:
            n = mapping[(int(a1), int(a2))]
          # `bt_len`-th feature is if the pair of atoms are in the same ring
        features[a1, ring, bt_len] = 1
        features[a1, a1, bt_len] = 0.
          if a2 == a1:
            features[n, bt_len] = 0
          else:
            features[n, bt_len] = 1
        #features[a1, ring, bt_len] = 1
        #features[a1, a1, bt_len] = 0.
    # graph distance between two atoms
    if graph_distance:
      # distance is a matrix of 1-hot encoded distances
      # distance is a matrix of 1-hot encoded distances for all atoms
      distance = find_distance(
          a1, num_atoms, bond_adj_list, max_distance=max_distance)
      features[a1, :, bt_len + 1:] = distance
      for a2 in range(num_atoms):
        if (int(a1), int(a2)) not in mapping:
          # For ring pairs outside max pairs distance continue
          continue
        else:
          n = mapping[(int(a1), int(a2))]
          features[n, bt_len + 1:] = distance[a2]
  # Euclidean distance between atoms
  if not graph_distance:
    coords = np.zeros((N, 3))
@@ -545,11 +579,11 @@ def pair_features(
      np.stack([coords] * N, axis=1) - \
      np.stack([coords] * N, axis=0)), axis=2))

  if max_pair_distance == "infinity":
    features = np.reshape(features, (N * N, bt_len + max_distance + 1))
    return features
  #if max_pair_distance == "infinity":
  #  features = np.reshape(features, (N * N, bt_len + max_distance + 1))
  #  return features

  return features
  return features, pair_edges


def find_distance(a1: RDKitAtom, num_atoms: int, bond_adj_list,
@@ -824,7 +858,7 @@ class WeaveFeaturizer(MolecularFeaturizer):
      bond_adj_list[bond[1]].append(bond[0])

    # Calculate pair features
    pairs = pair_features(
    pairs, pair_edges = pair_features(
        mol,
        bond_features_map,
        bond_adj_list,
@@ -832,7 +866,7 @@ class WeaveFeaturizer(MolecularFeaturizer):
        graph_distance=self.graph_distance,
        max_pair_distance=self.max_pair_distance)

    return WeaveMol(nodes, pairs)
    return WeaveMol(nodes, pairs, pair_edges)


class AtomicConvFeaturizer(ComplexNeighborListFragmentAtomicCoordinates):
+5 −1
Original line number Diff line number Diff line
@@ -379,11 +379,15 @@ class WeaveMol(object):
  .. [1] Kearnes, Steven, et al. "Molecular graph convolutions: moving beyond fingerprints." Journal of computer-aided molecular design 30.8 (2016): 595-608.
  """

  def __init__(self, nodes, pairs):
  def __init__(self, nodes, pairs, pair_edges):
    self.nodes = nodes
    self.pairs = pairs
    self.num_atoms = self.nodes.shape[0]
    self.n_features = self.nodes.shape[1]
    self.pair_edges = pair_edges

  def get_pair_edges(self):
    return self.pair_edges

  def get_pair_features(self):
    return self.pairs
+25 −20
Original line number Diff line number Diff line
@@ -33,6 +33,24 @@ def test_max_pair_distance_pairs():
  assert pair_edges.shape == (2, 9)


def test_max_pair_distance_infinity():
  """Test that max pair distance pairs are computed properly with infinity distance."""
  from rdkit import Chem
  # Test alkane
  mol = Chem.MolFromSmiles('CCC')
  # Test distance infinity
  pair_edges = max_pair_distance_pairs(mol, "infinity")
  # Everything is connected at this distance
  assert pair_edges.shape == (2, 9)

  # Test pentane
  mol = Chem.MolFromSmiles('CCCCC')
  # Test distance infinity
  pair_edges = max_pair_distance_pairs(mol, "infinity")
  # Everything is connected at this distance
  assert pair_edges.shape == (2, 25)


def test_weave_single_carbon():
  """Test that single carbon atom is featurized properly."""
  mols = ['C']
@@ -73,8 +91,10 @@ def test_weave_alkane_max_pairs():
  """Test on simple alkane with max pairs distance cutoff"""
  mols = ['CCC']
  featurizer = dc.feat.WeaveFeaturizer(max_pair_distance=1)
  mol_list = featurizer.featurize(mols)
  mol = mol_list[0]
  #mol_list = featurizer.featurize(mols)
  #mol = mol_list[0]
  from rdkit import Chem
  mol = featurizer._featurize(Chem.MolFromSmiles(mols[0]))

  # 3 carbonds in alkane
  assert mol.get_num_atoms() == 3
@@ -82,7 +102,9 @@ def test_weave_alkane_max_pairs():
  # Test feature sizes
  assert mol.get_num_features() == 75

  # Should be a 3x3 interaction grid
  # Should be a 7x14 interaction grid since there are 7 pairs within graph
  # distance 1 (3 self interactions plus 2 bonds counted twice because of
  # symmetry)
  assert mol.get_pair_features().shape == (7, 14)


@@ -105,20 +127,3 @@ def test_carbon_nitrogen():

  # Should be a 3x3 interaction grid
  assert mol.get_pair_features().shape == (5 * 5, 14)


#def test_alkane_max_pair_distance():
#  """Test on simple alkane with max_pair_distance < infinity"""
#  mols = ['CCC']
#  featurizer = dc.feat.WeaveFeaturizer(max_pair_distance=1)
#  mol_list = featurizer.featurize(mols)
#  mol = mol_list[0]
#
#  # 3 carbonds in alkane
#  assert mol.get_num_atoms() == 3
#
#  # Test feature sizes
#  assert mol.get_num_features() == 75
#
#  # Should be a 3x3 interaction grid
#  assert mol.get_pair_features().shape == (3, 1, 14)
+66 −35
Original line number Diff line number Diff line
@@ -197,7 +197,6 @@ class WeaveModel(KerasModel):
    self.n_classes = n_classes

    # Build the model.

    atom_features = Input(shape=(self.n_atom_feat[0],))
    pair_features = Input(shape=(self.n_pair_feat[0],))
    pair_split = Input(shape=tuple(), dtype=tf.int32)
@@ -277,6 +276,71 @@ class WeaveModel(KerasModel):
    super(WeaveModel, self).__init__(
        model, loss, output_types=output_types, batch_size=batch_size, **kwargs)

  def compute_features_on_batch(self, X_b):
    """Compute tensors that will be input into the model from featurized representation.

    The featurized input to `WeaveModel` is instances of `WeaveMol` created by
    `WeaveFeaturizer`. This method converts input `WeaveMol` objects into
    tensors used by the Keras implementation to compute `WeaveModel` outputs.

    Parameters
    ----------
    X_b: np.ndarray
      A numpy array with dtype=object where elements are `WeaveMol` objects.

    Returns
    -------
    atom_feat: np.ndarray
      Of shape `(N_atoms, N_atom_feat)`.
    pair_feat: np.ndarray
      Of shape `(N_pairs, N_pair_feat)`. Note that `N_pairs` will depend on
      the number of pairs being considered. If `max_pair_distance` is
      "infinity", then this will be `N_atoms**2`. Else it will be the number
      of pairs within the specifed graph distance.
    pair_split: np.ndarray
      Of shape `(N_pairs,)`. The i-th entry in this array will tell you the
      originating atom for this pair (the "source"). Note that pairs are
      symmetric so for a pair `(a, b)`, both `a` and `b` will separately be
      sources at different points in this array.
    atom_split: np.ndarray
      Of shape `(N_atoms,)`. The i-th entry in this array will be the molecule
      with the i-th atom belongs to.
    atom_to_pair: np.ndarray
      Of shape `(N_pairs, 2)`. The i-th row in this array will be the array
      `[a, b]` if `(a, b)` is a pair to be considered. (Note by symmetry, this
      implies some other row will contain `[b, a]`.
    """
    atom_feat = []
    pair_feat = []
    atom_split = []
    atom_to_pair = []
    pair_split = []
    start = 0
    for im, mol in enumerate(X_b):
      n_atoms = mol.get_num_atoms()
      # pair_edges is of shape (2, N)
      pair_edges = mol.get_pair_edges()
      N_pairs = pair_edges[1]
      # number of atoms in each molecule
      atom_split.extend([im] * n_atoms)
      # index of pair features
      C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
      atom_to_pair.append(pair_edges.T + start)
      # Get starting pair atoms
      pair_starts = pair_edges.T[:, 0]
      # number of pairs for each atom
      pair_split.extend(pair_starts + start)
      start = start + n_atoms

      # atom features
      atom_feat.append(mol.get_atom_features())
      # pair features
      pair_feat.append(mol.get_pair_features())

    return (np.concatenate(atom_feat, axis=0), np.concatenate(
        pair_feat, axis=0), np.array(pair_split), np.array(atom_split),
            np.concatenate(atom_to_pair, axis=0))

  def default_generator(
      self,
      dataset: Dataset,
@@ -313,40 +377,7 @@ class WeaveModel(KerasModel):
          if self.mode == 'classification':
            y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
                -1, self.n_tasks, self.n_classes)
        atom_feat = []
        pair_feat = []
        atom_split = []
        atom_to_pair = []
        pair_split = []
        start = 0
        for im, mol in enumerate(X_b):
          n_atoms = mol.get_num_atoms()
          # number of atoms in each molecule
          atom_split.extend([im] * n_atoms)
          # index of pair features
          C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
          atom_to_pair.append(
              np.transpose(
                  np.array([C1.flatten() + start,
                            C0.flatten() + start])))
          # number of pairs for each atom
          pair_split.extend(C1.flatten() + start)
          start = start + n_atoms

          # atom features
          atom_feat.append(mol.get_atom_features())
          # pair features
          pair_feat.append(
              np.reshape(mol.get_pair_features(),
                         (n_atoms * n_atoms, self.n_pair_feat[0])))

        inputs = [
            np.concatenate(atom_feat, axis=0),
            np.concatenate(pair_feat, axis=0),
            np.array(pair_split),
            np.array(atom_split),
            np.concatenate(atom_to_pair, axis=0)
        ]
        inputs = self.compute_features_on_batch(X_b)
        yield (inputs, [y_b], [w_b])


+116 −5
Original line number Diff line number Diff line
@@ -39,6 +39,88 @@ def get_dataset(mode='classification', featurizer='GraphConv', num_tasks=2):
  return tasks, ds, transformers, metric


def test_compute_features_on_infinity_distance():
  """Test that WeaveModel correctly transforms WeaveMol objects into tensors with infinite max_pair_distance."""
  featurizer = dc.feat.WeaveFeaturizer(max_pair_distance="infinity")
  X = featurizer(["C", "CCC"])
  batch_size = 20
  model = WeaveModel(
      1,
      batch_size=batch_size,
      mode='classification',
      fully_connected_layer_sizes=[2000, 1000],
      batch_normalize=True,
      batch_normalize_kwargs={
          "fused": False,
          "trainable": True,
          "renorm": True
      },
      learning_rage=0.0005)
  atom_feat, pair_feat, pair_split, atom_split, atom_to_pair = model.compute_features_on_batch(
      X)

  # There are 4 atoms each of which have 75 atom features
  assert atom_feat.shape == (4, 75)
  # There are 10 pairs with infinity distance and 14 pair features
  assert pair_feat.shape == (10, 14)
  # 4 atoms in total
  assert atom_split.shape == (4,)
  assert np.all(atom_split == np.array([0, 1, 1, 1]))
  # 10 pairs in total
  assert pair_split.shape == (10,)
  assert np.all(pair_split == np.array([0, 1, 1, 1, 2, 2, 2, 3, 3, 3]))
  # 10 pairs in total each with start/finish
  assert atom_to_pair.shape == (10, 2)
  assert np.all(
      atom_to_pair == np.array([[0, 0], [1, 1], [1, 2], [1, 3], [2, 1], [2, 2],
                                [2, 3], [3, 1], [3, 2], [3, 3]]))


def test_compute_features_on_distance_1():
  """Test that WeaveModel correctly transforms WeaveMol objects into tensors with finite max_pair_distance."""
  featurizer = dc.feat.WeaveFeaturizer(max_pair_distance=1)
  X = featurizer(["C", "CCC"])
  batch_size = 20
  model = WeaveModel(
      1,
      batch_size=batch_size,
      mode='classification',
      fully_connected_layer_sizes=[2000, 1000],
      batch_normalize=True,
      batch_normalize_kwargs={
          "fused": False,
          "trainable": True,
          "renorm": True
      },
      learning_rage=0.0005)
  atom_feat, pair_feat, pair_split, atom_split, atom_to_pair = model.compute_features_on_batch(
      X)

  # There are 4 atoms each of which have 75 atom features
  assert atom_feat.shape == (4, 75)
  # There are 8 pairs with distance 1 and 14 pair features. (To see why 8,
  # there's the self pair for "C". For "CCC" there are 7 pairs including self
  # connections and accounting for symmetry.)
  assert pair_feat.shape == (8, 14)
  # 4 atoms in total
  assert atom_split.shape == (4,)
  assert np.all(atom_split == np.array([0, 1, 1, 1]))
  # 10 pairs in total
  assert pair_split.shape == (8,)
  print("pair_split")
  print(pair_split)
  print("atom_to_pair")
  print(atom_to_pair)
  # The center atom is self connected and to both neighbors so it appears
  # thrice. The canonical ranking used in MolecularFeaturizer means this
  # central atom is ranked last in ordering.
  assert np.all(pair_split == np.array([0, 1, 1, 2, 2, 3, 3, 3]))
  # 10 pairs in total each with start/finish
  assert atom_to_pair.shape == (8, 2)
  assert np.all(atom_to_pair == np.array([[0, 0], [1, 1], [1, 3], [2, 2],
                                          [2, 3], [3, 1], [3, 2], [3, 3]]))


@flaky
@pytest.mark.slow
def test_weave_model():
@@ -84,16 +166,15 @@ def test_weave_regression_model():
  assert scores['mean_absolute_error'] < 0.1


def test_weave_fit_simple():
  featurizer = dc.feat.WeaveFeaturizer()
def test_weave_fit_simple_infinity_distance():
  featurizer = dc.feat.WeaveFeaturizer(max_pair_distance="infinity")
  X = featurizer(["C", "CCC"])
  y = np.random.randint(2, size=(2,))
  y = np.array([0, 1.])
  dataset = dc.data.NumpyDataset(X, y)
  tasks, dataset, transformers, metric = get_dataset('classification', 'Weave')

  batch_size = 20
  model = WeaveModel(
      len(tasks),
      1,
      batch_size=batch_size,
      mode='classification',
      fully_connected_layer_sizes=[2000, 1000],
@@ -105,5 +186,35 @@ def test_weave_fit_simple():
      },
      learning_rage=0.0005)
  model.fit(dataset, nb_epoch=200)
  transformers = []
  metric = dc.metrics.Metric(
      dc.metrics.roc_auc_score, np.mean, mode="classification")
  scores = model.evaluate(dataset, [metric], transformers)
  assert scores['mean-roc_auc_score'] >= 0.9


def test_weave_fit_simple_distance_1():
  featurizer = dc.feat.WeaveFeaturizer(max_pair_distance=1)
  X = featurizer(["C", "CCC"])
  y = np.array([0, 1.])
  dataset = dc.data.NumpyDataset(X, y)

  batch_size = 20
  model = WeaveModel(
      1,
      batch_size=batch_size,
      mode='classification',
      fully_connected_layer_sizes=[2000, 1000],
      batch_normalize=True,
      batch_normalize_kwargs={
          "fused": False,
          "trainable": True,
          "renorm": True
      },
      learning_rage=0.0005)
  model.fit(dataset, nb_epoch=200)
  transformers = []
  metric = dc.metrics.Metric(
      dc.metrics.roc_auc_score, np.mean, mode="classification")
  scores = model.evaluate(dataset, [metric], transformers)
  assert scores['mean-roc_auc_score'] >= 0.9