Unverified Commit 9f7fa857 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1411 from rbharath/ch5_changes

Upgrades to Atomic Conv Support
parents 06dddf40 b09ee6ad
Loading
Loading
Loading
Loading
+22 −0
Original line number Diff line number Diff line
@@ -1313,6 +1313,28 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
                channel_power=None,
                nb_channel=16,
                dtype="np.int8"):
    """Private helper function to voxelize inputs.

    Parameters
    ----------
    get_voxels: function
      Function that voxelizes inputs
    hash_function: function
      Used to map feature choices to voxel channels.  
    coordinates: np.ndarray
      Contains the 3D coordinates of a molecular system.
    feature_dict: Dictionary
      Keys are atom indices.  
    feature_list: list
      List of available features. 
    channel_power: int
      If specified, nb_channel is set to 2**channel_power.
      TODO: This feels like a redundant parameter.
    nb_channel: int
      The number of feature channels computed per voxel 
    dtype: type
      The dtype of the numpy ndarray created to hold features.
    """

    if channel_power is not None:
      if channel_power == 0:
+1 −0
Original line number Diff line number Diff line
@@ -27,6 +27,7 @@ from deepchem.models.tensorgraph.models.text_cnn import TextCNNModel
from deepchem.models.tensorgraph.sequential import Sequential
from deepchem.models.tensorgraph.models.sequence_dnn import SequenceDNN
from deepchem.models.tensorgraph.models.ontology import OntologyModel, OntologyNode, create_gene_ontology
from deepchem.models.tensorgraph.models.atomic_conv import AtomicConvModel

#################### Compatibility imports for renamed TensorGraph models. Remove below with DeepChem 3.0. ####################

+75 −0
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@ from __future__ import unicode_literals

from nose.plugins.attrib import attr

import os
import deepchem
import numpy as np
import tensorflow as tf
@@ -14,6 +15,7 @@ import numpy as np
from deepchem.models.tensorgraph.models import atomic_conv
from deepchem.models.tensorgraph import layers
from deepchem.data import NumpyDataset
from deepchem.feat.atomic_coordinates import ComplexNeighborListFragmentAtomicCoordinates


class TestAtomicConv(unittest.TestCase):
@@ -63,3 +65,76 @@ class TestAtomicConv(unittest.TestCase):
    labels = np.zeros(batch_size)
    train = NumpyDataset(features, labels)
    atomic_convnet.fit(train, nb_epoch=1)

  @attr("slow")
  def test_atomic_conv_variable(self):
    """Initialize and fit an AtomicConvModel on fragments of different sizes.

    The two fragments deliberately have different atom counts so that the
    per-fragment feature arrays have different shapes. Coordinates and
    atomic numbers are random and every neighbor list is empty; the test
    makes no assertions and passes if one epoch of fitting completes
    without raising.
    """
    # The two fragments have *different* numbers of atoms; the complex
    # contains all atoms of both fragments.
    frag1_num_atoms = 1000
    frag2_num_atoms = 1200
    complex_num_atoms = frag1_num_atoms + frag2_num_atoms
    batch_size = 1
    atomic_convnet = atomic_conv.AtomicConvModel(
        batch_size=batch_size,
        frag1_num_atoms=frag1_num_atoms,
        frag2_num_atoms=frag2_num_atoms,
        complex_num_atoms=complex_num_atoms)

    # Creates a set of dummy features that contain the coordinate and
    # neighbor-list features required by the AtomicConvModel.
    features = []
    frag1_coords = np.random.rand(frag1_num_atoms, 3)
    frag1_nbr_list = {i: [] for i in range(frag1_num_atoms)}
    frag1_z = np.random.randint(10, size=(frag1_num_atoms))
    frag2_coords = np.random.rand(frag2_num_atoms, 3)
    frag2_nbr_list = {i: [] for i in range(frag2_num_atoms)}
    frag2_z = np.random.randint(10, size=(frag2_num_atoms))
    system_coords = np.random.rand(complex_num_atoms, 3)
    system_nbr_list = {i: [] for i in range(complex_num_atoms)}
    system_z = np.random.randint(10, size=(complex_num_atoms))

    features.append(
        (frag1_coords, frag1_nbr_list, frag1_z, frag2_coords, frag2_nbr_list,
         frag2_z, system_coords, system_nbr_list, system_z))
    # Each sample is a tuple of differently-shaped arrays and dicts, so the
    # container must be an object array. Requesting dtype=object explicitly
    # avoids relying on implicit ragged-array creation, which newer NumPy
    # releases reject with a ValueError.
    features = np.asarray(features, dtype=object)
    labels = np.zeros(batch_size)
    train = NumpyDataset(features, labels)
    atomic_convnet.fit(train, nb_epoch=1)

  @attr("slow")
  def test_atomic_conv_with_feat(self):
    """Fit an AtomicConvModel on features computed from the 3zso PDB files.

    The ligand and protein structures are read from the featurizer test
    data directory, converted with
    ComplexNeighborListFragmentAtomicCoordinates, stored in a DiskDataset
    with a single arbitrary label, and passed to ``fit``. The test makes no
    assertions; it passes if fitting completes without raising.
    """
    # Resolve the input structures relative to this test module.
    here = os.path.dirname(os.path.realpath(__file__))
    ligand_pdb = os.path.join(here,
                              "../../../feat/tests/data/3zso_ligand_hyd.pdb")
    protein_pdb = os.path.join(here,
                               "../../../feat/tests/data/3zso_protein.pdb")

    # Atom counts were read off the PDB files. With a larger collection of
    # PDBs these would be maximum sizes rather than exact ones.
    num_ligand_atoms = 44
    num_protein_atoms = 2336
    num_complex_atoms = 2380
    # Neighbor-list settings; the cutoff is in angstroms.
    nbrs_per_atom = 4
    cutoff_angstroms = 4
    featurizer = ComplexNeighborListFragmentAtomicCoordinates(
        num_ligand_atoms, num_protein_atoms, num_complex_atoms, nbrs_per_atom,
        cutoff_angstroms)

    # Only the features are needed; the second return value is discarded.
    features, _unused = featurizer.featurize_complexes([ligand_pdb],
                                                       [protein_pdb])
    # The label value is arbitrary.
    dataset = deepchem.data.DiskDataset.from_numpy(features, np.array([0]))

    print("Constructing Atomic Conv model")
    model = atomic_conv.AtomicConvModel(
        batch_size=1,
        frag1_num_atoms=num_ligand_atoms,
        frag2_num_atoms=num_protein_atoms,
        complex_num_atoms=num_complex_atoms)

    print("About to call fit")
    # Run a fitting operation
    model.fit(dataset)
+6 −8
Original line number Diff line number Diff line
@@ -209,21 +209,19 @@ def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
  labels = np.array(labels)
  # Featurize Data
  if featurizer == "grid":
    # TODO: This is not the correct setting. Set hyperparameters correctly
    ecfp_power = 5
    splif_power = 5
    featurizer = rgf.RdkitGridFeaturizer(
        voxel_width=16.0,
        feature_types=['ecfp', 'splif', 'hbond', 'salt_bridge'],
        ecfp_power=ecfp_power,
        splif_power=splif_power,
        voxel_width=2.0,
        feature_types=[
            'ecfp', 'splif', 'hbond', 'salt_bridge', 'pi_stack', 'cation_pi',
            'charge'
        ],
        flatten=True)
  elif featurizer == "atomic":
    # Pulled from PDB files. For larger datasets with more PDBs, would use
    # max num atoms instead of exact.
    frag1_num_atoms = 70  # for ligand atoms
    frag2_num_atoms = 24000  # for protein atoms
    complex_num_atoms = 24060  # in total
    complex_num_atoms = 24070  # in total
    max_num_neighbors = 4
    # Cutoff in angstroms
    neighbor_cutoff = 4
+49 −0
Original line number Diff line number Diff line
"""
Script that trains Atomic Conv models on PDBbind dataset.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import os
import deepchem as dc
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from deepchem.molnet import load_pdbbind

from deepchem.models.tensorgraph.models.atomic_conv import AtomicConvModel

# For stable runs
np.random.seed(123)

# Load the PDBbind core subset featurized for atomic convolutions and split
# randomly into train/valid/test datasets.
pdbbind_tasks, pdbbind_datasets, transformers = load_pdbbind(
    featurizer="atomic", split="random", subset="core")
train_dataset, valid_dataset, test_dataset = pdbbind_datasets

metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

# Fragment sizes passed to the model. NOTE(review): these should agree with
# the sizes used by the "atomic" featurizer in load_pdbbind -- confirm if
# either side changes.
frag1_num_atoms = 70  # for ligand atoms
frag2_num_atoms = 24000  # for protein atoms
complex_num_atoms = frag1_num_atoms + frag2_num_atoms
# Number of complexes per training batch; tune to fit available memory.
batch_size = 24
model = AtomicConvModel(
    batch_size=batch_size,
    frag1_num_atoms=frag1_num_atoms,
    frag2_num_atoms=frag2_num_atoms,
    complex_num_atoms=complex_num_atoms)

# Fit trained model
print("Fitting model on train dataset")
model.fit(train_dataset)
model.save()

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)
Loading