Unverified commit de25d6c1 authored by Vignesh Ram Somnath and committed by GitHub

AtomicConvFeaturizer and make_estimators

Add an AtomicConvFeaturizer for ligand-protein complexes and make_estimator support for DTNNModel (compute_features_on_batch, create_estimator_inputs).
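A minimal usage sketch of the new featurizer (the atom counts and cutoff mirror the test added in this commit; the PDB file paths are placeholders):

import numpy as np
from deepchem.feat.graph_features import AtomicConvFeaturizer

# One label per complex; the value here is a placeholder
labels = np.array([0.0])

featurizer = AtomicConvFeaturizer(
    labels=labels,
    frag1_num_atoms=44,  # ligand atoms
    frag2_num_atoms=2336,  # protein atoms
    complex_num_atoms=2380,  # in total
    max_num_neighbors=4,
    neighbor_cutoff=4,  # angstroms
    batch_size=1,
    epochs=1)

# Featurizes each ligand-protein pair, fits an AtomicConvModel, and returns
# atomic convolution features plus the indices of complexes that failed
features, failures = featurizer.featurize_complexes(
    ["ligand.pdb"], ["protein.pdb"])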
parents 8b870301 049779ec
+152 −0
@@ -4,8 +4,18 @@ from __future__ import unicode_literals
import numpy as np
from rdkit import Chem

import deepchem as dc
from deepchem.feat import Featurizer
from deepchem.feat.atomic_coordinates import ComplexNeighborListFragmentAtomicCoordinates
from deepchem.feat.mol_graphs import ConvMol, WeaveMol
from deepchem.data import DiskDataset
import multiprocessing
import logging


def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
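  # Module-level function (rather than a bound method) so that
  # multiprocessing.Pool can pickle it; the featurizer is passed explicitly.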
  logging.info(log_message)
  return featurizer._featurize_complex(mol_pdb_file, protein_pdb_file)


def one_of_k_encoding(x, allowable_set):
@@ -424,3 +434,145 @@ class WeaveFeaturizer(Featurizer):
        graph_distance=self.graph_distance)

    return WeaveMol(nodes, pairs)


class AtomicConvFeaturizer(ComplexNeighborListFragmentAtomicCoordinates):
  """This class computes the Atomic Convolution features"""

  # TODO (VIGS25): Complete the description

  name = ['atomic_conv']

  def __init__(self,
               labels,
               neighbor_cutoff,
               frag1_num_atoms=70,
               frag2_num_atoms=634,
               complex_num_atoms=701,
               max_num_neighbors=12,
               batch_size=24,
               atom_types=[
                   6., 7., 8., 9., 11., 12., 15., 16., 17., 20., 25., 30.,
                   35., 53., -1.
               ],
               radial=[[
                   1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0,
                   7.5, 8.0, 8.5, 9.0, 9.5, 10.0, 10.5, 11.0, 11.5, 12.0
               ], [0.0, 4.0, 8.0], [0.4]],
               layer_sizes=[32, 32, 16],
               strip_hydrogens=True,
               learning_rate=0.001,
               epochs=10):
    """
    Parameters

    labels: numpy.ndarray
      Labels which we want to predict using the model
    neighbor_cutoff: int
      TODO (VIGS25): Add description
    frag1_num_atoms: int
      Number of atoms in first fragment
    frag2_num_atoms: int
      Number of atoms in second fragment
    complex_num_atoms: int
      TODO (VIGS25) : Add description
    max_num_neighbors: int
      Maximum number of neighbors possible for an atom
    batch_size: int
      Batch size used for training and evaluation
    atom_types: list
      List of atoms recognized by model. Atoms are indicated by their
      nuclear numbers.
    radial: list
      TODO (VIGS25): Add description
    layer_sizes: list
      List of layer sizes for the AtomicConvolutional Network
    strip_hydrogens: bool
      Whether to remove hydrogens while computing neighbor features
    learning_rate: float
      Learning rate for training the model
    epochs: int
      Number of epochs to train the model for
    """

    self.atomic_conv_model = dc.models.tensorgraph.models.atomic_conv.AtomicConvModel(
        frag1_num_atoms=frag1_num_atoms,
        frag2_num_atoms=frag2_num_atoms,
        complex_num_atoms=complex_num_atoms,
        max_num_neighbors=max_num_neighbors,
        batch_size=batch_size,
        atom_types=atom_types,
        radial=radial,
        layer_sizes=layer_sizes,
        learning_rate=learning_rate)

    super(AtomicConvFeaturizer, self).__init__(
        frag1_num_atoms=frag1_num_atoms,
        frag2_num_atoms=frag2_num_atoms,
        complex_num_atoms=complex_num_atoms,
        max_num_neighbors=max_num_neighbors,
        neighbor_cutoff=neighbor_cutoff,
        strip_hydrogens=strip_hydrogens)

    self.epochs = epochs
    self.labels = labels

  def featurize_complexes(self, mol_files, protein_files):
    """Featurizes ligand-protein complexes in parallel.

    Fits the underlying AtomicConvModel on the featurized complexes and
    returns its atomic convolution features, along with the indices of
    complexes that failed to featurize.
    """
    pool = multiprocessing.Pool()
    results = []
    for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_files)):
      log_message = "Featurizing %d / %d" % (i + 1, len(mol_files))
      results.append(
          pool.apply_async(_featurize_complex,
                           (self, mol_file, protein_pdb, log_message)))
    pool.close()
    features = []
    failures = []
    for ind, result in enumerate(results):
      new_features = result.get()
      # Handle loading failures which return None
      if new_features is not None:
        features.append(new_features)
      else:
        failures.append(ind)

    features = np.asarray(features)
    # Drop the labels of complexes that failed to featurize
    labels = np.delete(self.labels, failures, axis=0)
    dataset = DiskDataset.from_numpy(features, labels)

    # Fit atomic conv model
    self.atomic_conv_model.fit(dataset, nb_epoch=self.epochs)

    # Add the Atomic Convolution layers to fetches
    layers_to_fetch = list()
    for layer in self.atomic_conv_model.layers.values():
      if isinstance(layer,
                    dc.models.tensorgraph.models.atomic_conv.AtomicConvolution):
        layers_to_fetch.append(layer)

    # Extract the atomic convolution features
    atomic_conv_features = list()
    feed_dict_generator = self.atomic_conv_model.default_generator(
        dataset=dataset, epochs=1)

    for feed_dict in self.atomic_conv_model._create_feed_dicts(
        feed_dict_generator, training=False):
      frag1_conv, frag2_conv, complex_conv = self.atomic_conv_model._run_graph(
          outputs=layers_to_fetch, feed_dict=feed_dict, training=False)
      concatenated = np.concatenate(
          [frag1_conv, frag2_conv, complex_conv], axis=1)
      atomic_conv_features.append(concatenated)

    # iterbatches pads the final batch up to batch_size (e.g. 5 complexes
    # with batch_size=24 are padded to 24 samples), so stack the per-batch
    # outputs and drop the padded samples at the end
    atomic_conv_features = np.concatenate(atomic_conv_features, axis=0)
    atomic_conv_features = atomic_conv_features[:len(features)]

    return atomic_conv_features, failures
+34 −4
@@ -10,11 +10,8 @@ __license__ = "MIT"

 import unittest
 import os
-import sys
 import numpy as np
-from deepchem.feat.mol_graphs import ConvMol
-from deepchem.feat.mol_graphs import MultiConvMol
-from deepchem.feat.graph_features import ConvMolFeaturizer
+from deepchem.feat.graph_features import ConvMolFeaturizer, AtomicConvFeaturizer


class TestConvMolFeaturizer(unittest.TestCase):
@@ -95,3 +92,36 @@ class TestConvMolFeaturizer(unittest.TestCase):
    assert np.array_equal(deg_adj_lists[4], np.zeros([0, 4], dtype=np.int32))
    assert np.array_equal(deg_adj_lists[5], np.zeros([0, 5], dtype=np.int32))
    assert np.array_equal(deg_adj_lists[6], np.zeros([0, 6], dtype=np.int32))


class TestAtomicConvFeaturizer(unittest.TestCase):

  def test_feature_generation(self):
    """Test if featurization works using AtomicConvFeaturizer."""
    dir_path = os.path.dirname(os.path.realpath(__file__))
    ligand_file = os.path.join(dir_path, "data/3zso_ligand_hyd.pdb")
    protein_file = os.path.join(dir_path, "data/3zso_protein.pdb")
    # Pulled from PDB files. For larger datasets with more PDBs, would use
    # max num atoms instead of exact.

    frag1_num_atoms = 44  # for ligand atoms
    frag2_num_atoms = 2336  # for protein atoms
    complex_num_atoms = 2380  # in total
    max_num_neighbors = 4
    # Cutoff in angstroms
    neighbor_cutoff = 4

    labels = np.array([0, 0])

    featurizer = AtomicConvFeaturizer(
        labels=labels,
        batch_size=1,
        epochs=1,
        frag1_num_atoms=frag1_num_atoms,
        frag2_num_atoms=frag2_num_atoms,
        complex_num_atoms=complex_num_atoms,
        max_num_neighbors=max_num_neighbors,
        neighbor_cutoff=neighbor_cutoff)

    features, _ = featurizer.featurize_complexes([ligand_file, ligand_file],
                                                 [protein_file, protein_file])
+69 −32
@@ -311,24 +311,13 @@ class DTNNModel(TensorGraph):
    weighted_loss = ReduceSum(L2Loss(in_layers=[labels, output, weights]))
    self.set_loss(weighted_loss)

-  def default_generator(self,
-                        dataset,
-                        epochs=1,
-                        predict=False,
-                        deterministic=True,
-                        pad_batches=True):
-    """TensorGraph style implementation"""
-    for epoch in range(epochs):
-      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
-          batch_size=self.batch_size,
-          deterministic=deterministic,
-          pad_batches=pad_batches):
-
-        feed_dict = dict()
-        if y_b is not None:
-          feed_dict[self.labels[0]] = y_b
-        if w_b is not None:
-          feed_dict[self.task_weights[0]] = w_b
+  def compute_features_on_batch(self, X_b):
+    """Computes the values for different Feature Layers on a given batch.
+
+    A tf.py_func wrapper is written around this when creating the
+    input_fn for tf.Estimator.
+    """
    distance = []
    atom_membership = []
    distance_membership_i = []
@@ -352,18 +341,66 @@ class DTNNModel(TensorGraph):
      distance_membership_i.append(membership_i + start)
      distance_membership_j.append(membership_j + start)
      start = start + num_atoms[im]
-        feed_dict[self.atom_number] = np.concatenate(atom_number)
-        distance = np.concatenate(distance, 0)
-        feed_dict[self.distance] = np.exp(
-            -np.square(distance - self.steps) / (2 * self.step_size**2))
-        feed_dict[self.distance_membership_i] = np.concatenate(
-            distance_membership_i)
-        feed_dict[self.distance_membership_j] = np.concatenate(
-            distance_membership_j)
-        feed_dict[self.atom_membership] = np.concatenate(atom_membership)
+    atom_number = np.concatenate(atom_number).astype(np.int32)
+    distance = np.concatenate(distance, axis=0)
+    gaussian_dist = np.exp(
+        -np.square(distance - self.steps) / (2 * self.step_size**2))
+    gaussian_dist = gaussian_dist.astype(np.float32)
+    atom_mem = np.concatenate(atom_membership).astype(np.int32)
+    dist_mem_i = np.concatenate(distance_membership_i).astype(np.int32)
+    dist_mem_j = np.concatenate(distance_membership_j).astype(np.int32)
+
+    features = [atom_number, gaussian_dist, dist_mem_i, dist_mem_j, atom_mem]
+
+    return features

  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        deterministic=True,
                        pad_batches=True):
    """TensorGraph style implementation"""
    for epoch in range(epochs):
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches):

        feed_dict = dict()
        if y_b is not None:
          feed_dict[self.labels[0]] = y_b
        if w_b is not None:
          feed_dict[self.task_weights[0]] = w_b

        features = self.compute_features_on_batch(X_b)
        feed_dict[self.atom_number] = features[0]
        feed_dict[self.distance] = features[1]
        feed_dict[self.distance_membership_i] = features[2]
        feed_dict[self.distance_membership_j] = features[3]
        feed_dict[self.atom_membership] = features[4]

        yield feed_dict

  def create_estimator_inputs(self, feature_columns, weight_column, features,
                              labels, mode):
    tensors = dict()
    for layer, column in zip(self.features, feature_columns):
      feature_col = tf.feature_column.input_layer(features, [column])
      if column.dtype != feature_col.dtype:
        feature_col = tf.cast(feature_col, column.dtype)
      if len(column.shape) < 1:
        feature_col = tf.reshape(feature_col, shape=[tf.shape(feature_col)[0]])
      tensors[layer] = feature_col
    if weight_column is not None:
      tensors[self.task_weights[0]] = tf.feature_column.input_layer(
          features, [weight_column])
    if labels is not None:
      tensors[self.labels[0]] = labels

    return tensors
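Note the positional pairing above: zip(self.features, feature_columns) means the feature columns passed to make_estimator must follow the order of model.features, which appears to be the order in which compute_features_on_batch returns them; the DTNN estimator test below builds its columns in exactly that order.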


class DAGModel(TensorGraph):

+71 −0
@@ -5,6 +5,9 @@ import deepchem as dc
import deepchem.models.tensorgraph.layers as layers
from deepchem.data import NumpyDataset
from deepchem.models.tensorgraph.models.text_cnn import default_dict
from scipy.io import loadmat
from flaky import flaky
import os


class TestEstimators(unittest.TestCase):
@@ -453,3 +456,71 @@ class TestEstimators(unittest.TestCase):
    # Train the model.

    estimator.train(input_fn=lambda: input_fn(100))

  @flaky
  def test_dtnn_regression_model(self):
    """Test creating an estimator for DTNNGraphModel for regression"""
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../")
    input_file = os.path.join(data_dir, "models", "example_DTNN.mat")
    dataset = loadmat(input_file)

    np.random.seed(123)
    X = dataset['X']
    y = dataset['T'].astype(np.float32)
    w = np.ones_like(y)
    dataset = dc.data.NumpyDataset(X, y, w, ids=None)
    n_tasks = y.shape[1]
    n_samples = y.shape[0]

    dtypes = [tf.int32, tf.float32, tf.int32, tf.int32, tf.int32]

    model = dc.models.DTNNModel(
        n_tasks,
        n_embedding=20,
        n_distance=100,
        learning_rate=1.0,
        mode="regression")

    def mean_relative_error(labels, predictions, weights):
      error = tf.abs(1 - tf.div(labels, predictions))
      error_val, update_op = tf.metrics.mean(error)
      return error_val, update_op

    def input_fn(batch_size, epochs):
      X, y, weights = dataset.make_iterator(
          batch_size=batch_size, epochs=epochs).get_next()
      features = tf.py_func(
          model.compute_features_on_batch, inp=[X], Tout=dtypes)

      assert len(features) == 5
      feature_dict = dict()
      feature_dict['atom_num'] = features[0]
      feature_dict['distance'] = features[1]
      feature_dict['dist_mem_i'] = features[2]
      feature_dict['dist_mem_j'] = features[3]
      feature_dict['atom_mem'] = features[4]
      feature_dict['weights'] = weights

      return feature_dict, y

    atom_number = tf.feature_column.numeric_column(
        'atom_num', shape=[], dtype=dtypes[0])
    distance = tf.feature_column.numeric_column(
        'distance', shape=(model.n_distance,), dtype=dtypes[1])
    atom_mem = tf.feature_column.numeric_column(
        'atom_mem', shape=[], dtype=dtypes[2])
    dist_mem_i = tf.feature_column.numeric_column(
        'dist_mem_i', shape=[], dtype=dtypes[3])
    dist_mem_j = tf.feature_column.numeric_column(
        'dist_mem_j', shape=[], dtype=dtypes[4])

    weight_col = tf.feature_column.numeric_column('weights', shape=(n_tasks,))
    metrics = {'error': mean_relative_error}

    feature_cols = [atom_number, distance, dist_mem_i, dist_mem_j, atom_mem]
    estimator = model.make_estimator(
        feature_columns=feature_cols, weight_column=weight_col, metrics=metrics)
    estimator.train(input_fn=lambda: input_fn(100, 250))

    results = estimator.evaluate(input_fn=lambda: input_fn(n_samples, 1))
    assert results['error'] < 0.1
+18 −0
@@ -18,6 +18,7 @@ import logging
import tarfile
from deepchem.feat import rdkit_grid_featurizer as rgf
from deepchem.feat.atomic_coordinates import ComplexNeighborListFragmentAtomicCoordinates
from deepchem.feat.graph_features import AtomicConvFeaturizer

logger = logging.getLogger(__name__)

@@ -233,6 +234,22 @@ def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
        frag1_num_atoms, frag2_num_atoms, complex_num_atoms, max_num_neighbors,
        neighbor_cutoff)

  elif featurizer == "atomic_conv":
    frag1_num_atoms = 70  # for ligand atoms
    frag2_num_atoms = 24000  # for protein atoms
    complex_num_atoms = 24070  # in total
    max_num_neighbors = 4
    # Cutoff in angstroms
    neighbor_cutoff = 4
    featurizer = AtomicConvFeaturizer(
        labels=labels,
        frag1_num_atoms=frag1_num_atoms,
        frag2_num_atoms=frag2_num_atoms,
        complex_num_atoms=complex_num_atoms,
        neighbor_cutoff=neighbor_cutoff,
        max_num_neighbors=max_num_neighbors,
        batch_size=64)

  else:
    raise ValueError("Featurizer not supported")
  print("Featurizing Complexes")
@@ -241,6 +258,7 @@ def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
  # Delete labels for failing elements
  labels = np.delete(labels, failures)
  dataset = deepchem.data.DiskDataset.from_numpy(features, labels)
  print('Featurization complete.')
  # No transformations of data
  transformers = []
  if split == None: