Commit 48d8d6b5 authored by leswing's avatar leswing
Browse files

Cleaned UP

parent 5019ad5d
Loading
Loading
Loading
Loading
+8 −4
Original line number Diff line number Diff line
@@ -134,10 +134,12 @@ def get_atom_adj_matrices(mol,
  return (adj_matrix.astype(np.uint8), atom_matrix.astype(np.uint8))


def featurize_mol(mol, n_atom_types, max_n_atoms, max_valence):

def featurize_mol(mol, n_atom_types, max_n_atoms, max_valence,
                  num_atoms_feature):
  adj_matrix, atom_matrix = get_atom_adj_matrices(mol, n_atom_types,
                                                  max_n_atoms, max_valence)
  if num_atoms_feature:
    return ((adj_matrix, atom_matrix, mol.GetNumAtoms()))
  return ((adj_matrix, atom_matrix))


@@ -147,11 +149,13 @@ class AdjacencyFingerprint(Featurizer):
               n_atom_types=23,
               max_n_atoms=200,
               add_hydrogens=False,
               max_valence=4):
               max_valence=4,
               num_atoms_feature=False):
    self.n_atom_types = n_atom_types
    self.max_n_atoms = max_n_atoms
    self.add_hydrogens = add_hydrogens
    self.max_valence = max_valence
    self.num_atoms_feature = num_atoms_feature

  def featurize(self, rdkit_mols):
    featurized_mols = np.empty((len(rdkit_mols)), dtype=object)
@@ -160,6 +164,6 @@ class AdjacencyFingerprint(Featurizer):
      if self.add_hydrogens:
        mol = Chem.AddHs(mol)
      featurized_mol = featurize_mol(mol, self.n_atom_types, self.max_n_atoms,
                                     self.max_valence)
                                     self.max_valence, self.num_atoms_feature)
      featurized_mols[idx] = featurized_mol
    return (featurized_mols)
+149 −3
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@ import tensorflow as tf
import numpy as np

from deepchem.nn import model_ops, initializations, regularizers, activations
import math


class Layer(object):
@@ -3112,3 +3113,148 @@ class BetaShare(Layer):

  def set_tensors(self, tensor):
    self.out_tensor, self.betas = tensor


class PassThroughLayer(Layer):

  def __init__(self, output_num, **kwargs):
    self.output_num = output_num
    super(PassThroughLayer, self).__init__(**kwargs)

  def create_tensor(self, in_layers=None, set_tensors=True, **kwargs):
    self.out_tensor = self.in_layers[0].out_tensors[self.output_num]


class GraphCNNPoolLayer(Layer):

  def __init__(self, num_vertices, **kwargs):
    self.num_vertices = num_vertices
    super(GraphCNNPoolLayer, self).__init__(**kwargs)

  def create_tensor(self, in_layers=None, set_tensors=True, **kwargs):
    """
    TODO(LESWING) self.num_vertices = 1
    Parameters
    ----------
    in_layers
    set_tensors
    kwargs

    Returns
    -------

    """
    in_tensors = self._get_input_tensors(in_layers)
    if len(in_tensors) == 3:
      V, A, mask = in_tensors
    else:
      V, A = in_tensors
      mask = None
    factors = self.embedding_factors(
        V, self.num_vertices, name='%s_Factors' % self.name)

    if mask is not None:
      factors = tf.multiply(factors, mask)
    factors = self.softmax_factors(factors)

    result = tf.matmul(factors, V, transpose_a=True)

    result_A = tf.reshape(A, (tf.shape(A)[0], -1, tf.shape(A)[-1]))
    result_A = tf.matmul(result_A, factors)
    result_A = tf.reshape(result_A, (tf.shape(A)[0], tf.shape(A)[-1], -1))
    result_A = tf.matmul(factors, result_A, transpose_a=True)
    result_A = tf.reshape(result_A, (tf.shape(A)[0], self.num_vertices,
                                     A.get_shape()[2].value, self.num_vertices))
    # We do not need the mask because every graph has self.num_vertices vertices now
    # result = make_bn(result, True, mask=None, name="%s_bn" % self.name)
    self.out_tensors = [result, result_A, factors]

  def embedding_factors(self, V, no_filters, name="default"):
    no_features = V.get_shape()[-1].value
    W = tf.get_variable(
        '%s_weights' % name, [no_features, no_filters],
        initializer=tf.truncated_normal_initializer(
            stddev=1.0 / math.sqrt(no_features)),
        dtype=tf.float32)
    b = tf.get_variable(
        '%s_bias' % self.name, [no_filters],
        initializer=tf.constant_initializer(0.1),
        dtype=tf.float32)
    V_reshape = tf.reshape(V, (-1, no_features))
    s = tf.slice(tf.shape(V), [0], [len(V.get_shape()) - 1])
    s = tf.concat([s, tf.stack([no_filters])], 0)
    result = tf.reshape(tf.matmul(V_reshape, W) + b, s)
    return result

  def softmax_factors(self, V, axis=1, name=None):
    max_value = tf.reduce_max(V, axis=axis, keep_dims=True)
    exp = tf.exp(tf.subtract(V, max_value))
    prob = tf.div(exp, tf.reduce_sum(exp, axis=axis, keep_dims=True))
    return prob


def GraphCNNPool(num_vertices, **kwargs):
  gcnnpool_layer = GraphCNNPoolLayer(num_vertices, **kwargs)
  return [PassThroughLayer(x, in_layers=gcnnpool_layer) for x in range(3)]


class GraphCNNLayer(Layer):

  def __init__(self, num_filters, **kwargs):
    self.num_filters = num_filters
    super(GraphCNNLayer, self).__init__(**kwargs)

  def create_tensor(self, in_layers=None, set_tensors=True, **kwargs):
    inputs = self._get_input_tensors(in_layers)
    if len(inputs) == 3:
      V, A, mask = inputs
    else:
      V, A = inputs
      mask = None
    no_A = A.get_shape()[2].value
    no_features = V.get_shape()[2].value
    W = tf.get_variable(
        '%s_weights' % self.name, [no_features * no_A, self.num_filters],
        initializer=tf.truncated_normal_initializer(stddev=math.sqrt(
            1.0 / (no_features * (no_A + 1) * 1.0))),
        dtype=tf.float32)
    W_I = tf.get_variable(
        '%s_weights_I' % self.name, [no_features, self.num_filters],
        initializer=tf.truncated_normal_initializer(stddev=math.sqrt(
            1.0 / (no_features * (no_A + 1) * 1.0))),
        dtype=tf.float32)

    b = tf.get_variable(
        '%s_bias' % self.name, [self.num_filters],
        initializer=tf.constant_initializer(0.1),
        dtype=tf.float32)

    n = self.graphConvolution(V, A)
    A_shape = tf.shape(A)
    n = tf.reshape(n, [-1, A_shape[1], no_A * no_features])
    result = self.batch_mat_mult(n, W) + self.batch_mat_mult(V, W_I) + b
    if set_tensors:
      self.out_tensor = result
    return result

  def graphConvolution(self, V, A):
    no_A = A.get_shape()[2].value
    no_features = V.get_shape()[2].value

    A_shape = tf.shape(A)
    A_reshape = tf.reshape(A, tf.stack([-1, A_shape[1] * no_A, A_shape[1]]))
    n = tf.matmul(A_reshape, V)
    return tf.reshape(n, [-1, A_shape[1], no_A, no_features])

  def batch_mat_mult(self, A, B):
    A_shape = tf.shape(A)
    A_reshape = tf.reshape(A, [-1, A_shape[-1]])

    # So the Tensor has known dimensions
    if B.get_shape()[1] == None:
      axis_2 = -1
    else:
      axis_2 = B.get_shape()[1]
    result = tf.matmul(A_reshape, B)
    result = tf.reshape(result, tf.stack([A_shape[0], A_shape[1], axis_2]))
    return result
+153 −11
Original line number Diff line number Diff line
import math

import numpy as np
import six
import tensorflow as tf

from deepchem.data import NumpyDataset
from deepchem.feat.graph_features import ConvMolFeaturizer
from deepchem.feat.mol_graphs import ConvMol
from deepchem.metrics import to_one_hot, from_one_hot
from deepchem.metrics import to_one_hot
from deepchem.models.tensorgraph.graph_layers import WeaveLayer, WeaveGather, \
  Combine_AP, Separate_AP, DTNNEmbedding, DTNNStep, DTNNGather, DAGLayer, \
  DAGGather, DTNNExtract, MessagePassing, SetGather
from deepchem.models.tensorgraph.layers import Dense, Concat, SoftMax, \
  SoftMaxCrossEntropy, GraphConv, BatchNorm, \
  GraphPool, GraphGather, WeightedError, Dropout, BatchNormalization, Stack
  GraphPool, GraphGather, WeightedError, Dropout, BatchNormalization, Stack, Layer, Flatten, GraphCNNLayer, GraphCNNPool
from deepchem.models.tensorgraph.layers import L2Loss, Label, Weights, Feature
from deepchem.models.tensorgraph.tensor_graph import TensorGraph
from deepchem.trans import undo_transforms
from deepchem.utils.evaluate import GeneratorEvaluator
from deepchem.data import NumpyDataset
from deepchem.data.data_loader import featurize_smiles_np
from deepchem.feat.graph_features import ConvMolFeaturizer


class WeaveTensorGraph(TensorGraph):
@@ -487,6 +486,149 @@ class DAGTensorGraph(TensorGraph):
        yield feed_dict


class PetroskiSuchTensorGraph(TensorGraph):

  def __init__(self,
               n_tasks,
               max_atoms=200,
               dropout=0.2,
               mode="classification",
               **kwargs):
    """
    Parameters
    ----------
    n_tasks: int
      Number of tasks
    mode: str
      Either "classification" or "regression"
    """
    self.n_tasks = n_tasks
    self.mode = mode
    self.max_atoms = max_atoms
    self.error_bars = True if 'error_bars' in kwargs and kwargs['error_bars'] else False
    self.dropout = dropout
    kwargs['use_queue'] = False
    super(PetroskiSuchTensorGraph, self).__init__(**kwargs)
    self.build_graph()

  def build_graph(self):
    self.vertex_features = Feature(shape=(None, self.max_atoms, 75))
    self.adj_matrix = Feature(shape=(None, self.max_atoms, 1, self.max_atoms))
    self.mask = Feature(shape=(None, self.max_atoms, 1))

    gcnn1 = BatchNorm(
        GraphCNNLayer(
            num_filters=64,
            in_layers=[self.vertex_features, self.adj_matrix, self.mask]))
    gcnn1 = Dropout(self.dropout, in_layers=gcnn1)
    gcnn2 = BatchNorm(
        GraphCNNLayer(
            num_filters=64, in_layers=[gcnn1, self.adj_matrix, self.mask]))
    gcnn2 = Dropout(self.dropout, in_layers=gcnn2)
    gc_pool, adj_matrix, factors = GraphCNNPool(
        num_vertices=32, in_layers=[gcnn2, self.adj_matrix, self.mask])
    gc_pool = BatchNorm(gc_pool)
    gc_pool = Dropout(self.dropout, in_layers=gc_pool)
    gcnn3 = BatchNorm(
        GraphCNNLayer(num_filters=32, in_layers=[gc_pool, adj_matrix]))
    gcnn3 = Dropout(self.dropout, in_layers=gcnn3)
    gc_pool2, adj_matrix2, factors = GraphCNNPool(
        num_vertices=8, in_layers=[gcnn3, adj_matrix])
    gc_pool2 = BatchNorm(gc_pool2)
    gc_pool2 = Dropout(self.dropout, in_layers=gc_pool2)
    flattened = Flatten(in_layers=gc_pool2)
    readout = Dense(
        out_channels=256, activation_fn=tf.nn.relu, in_layers=flattened)
    costs = []
    self.my_labels = []
    for task in range(self.n_tasks):
      if self.mode == 'classification':
        classification = Dense(
            out_channels=2, activation_fn=None, in_layers=[readout])

        softmax = SoftMax(in_layers=[classification])
        self.add_output(softmax)

        label = Label(shape=(None, 2))
        self.my_labels.append(label)
        cost = SoftMaxCrossEntropy(in_layers=[label, classification])
        costs.append(cost)
      if self.mode == 'regression':
        regression = Dense(
            out_channels=1, activation_fn=None, in_layers=[readout])
        self.add_output(regression)

        label = Label(shape=(None, 1))
        self.my_labels.append(label)
        cost = L2Loss(in_layers=[label, regression])
        costs.append(cost)
    if self.mode == "classification":
      entropy = Concat(in_layers=costs, axis=-1)
    elif self.mode == "regression":
      entropy = Stack(in_layers=costs, axis=1)
    self.my_task_weights = Weights(shape=(None, self.n_tasks))
    loss = WeightedError(in_layers=[entropy, self.my_task_weights])
    self.set_loss(loss)

  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        deterministic=True,
                        pad_batches=True):
    for epoch in range(epochs):
      if not predict:
        print('Starting epoch %i' % epoch)
      for ind, (X_b, y_b, w_b, ids_b) in enumerate(
          dataset.iterbatches(
              self.batch_size, pad_batches=True, deterministic=deterministic)):
        d = {}
        for index, label in enumerate(self.my_labels):
          if self.mode == 'classification':
            d[label] = to_one_hot(y_b[:, index])
          if self.mode == 'regression':
            d[label] = np.expand_dims(y_b[:, index], -1)
        d[self.my_task_weights] = w_b
        d[self.adj_matrix] = np.expand_dims(np.array([x[0] for x in X_b]), -2)
        d[self.vertex_features] = np.array([x[1] for x in X_b])
        mask = np.zeros(shape=(self.batch_size, self.max_atoms, 1))
        for i in range(self.batch_size):
          mask_size = X_b[i][2]
          mask[i][:mask_size][0] = 1
        d[self.mask] = mask
        yield d

  def predict_proba_on_generator(self, generator, transformers=[]):
    if not self.built:
      self.build()
    with self._get_tf("Graph").as_default():
      out_tensors = [x.out_tensor for x in self.outputs]
      results = []
      for feed_dict in generator:
        feed_dict = {
            self.layers[k.name].out_tensor: v
            for k, v in six.iteritems(feed_dict)
        }
        feed_dict[self._training_placeholder] = 1.0  ##
        result = np.array(self.session.run(out_tensors, feed_dict=feed_dict))
        if len(result.shape) == 3:
          result = np.transpose(result, axes=[1, 0, 2])
        if len(transformers) > 0:
          result = undo_transforms(result, transformers)
        results.append(result)
      return np.concatenate(results, axis=0)

  def evaluate(self, dataset, metrics, transformers=[], per_task_metrics=False):
    if not self.built:
      self.build()
    return self.evaluate_generator(
        self.default_generator(dataset, predict=True),
        metrics,
        labels=self.my_labels,
        weights=[self.my_task_weights],
        per_task_metrics=per_task_metrics)


class GraphConvTensorGraph(TensorGraph):

  def __init__(self, n_tasks, mode="classification", **kwargs):
+38 −13
Original line number Diff line number Diff line
@@ -6,40 +6,65 @@ from __future__ import division
from __future__ import unicode_literals

import os

import deepchem
import numpy as np
import shutil
import deepchem as dc
from deepchem.data import DiskDataset


def load_tox21(featurizer='ECFP', split='index'):
  """Load Tox21 datasets. Does not do train/test split"""
  # Featurize Tox21 dataset
  print("About to featurize Tox21 dataset.")
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(
      current_dir, "../../datasets/tox21.csv.gz")
  tox21_tasks = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER',
                 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5',
                 'SR-HSE', 'SR-MMP', 'SR-p53']

  dataset_file = os.path.join(current_dir, "../../datasets/tox21.csv.gz")
  data_dir = deepchem.utils.get_data_dir()

  tox21_tasks = [
      'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
      'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'
  ]

  dataset_dir = os.path.join(data_dir, "tox21", featurizer, split)
  train, valid, test = os.path.join(dataset_dir, 'train'), os.path.join(
      dataset_dir, 'valid'), os.path.join(dataset_dir, 'test')
  if os.path.isdir(dataset_dir):
    train, valid, test = DiskDataset(data_dir=train), DiskDataset(
        data_dir=valid), DiskDataset(data_dir=test)
    transformers = [
        dc.trans.BalancingTransformer(transform_w=True, dataset=train)
    ]
    return tox21_tasks, (train, valid, test), transformers
  if featurizer == 'ECFP':
    featurizer_func = dc.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer_func = dc.feat.ConvMolFeaturizer()
  elif featurizer == 'AdjMatrix':
    featurizer_func = dc.feat.AdjacencyFingerprint(num_atoms_feature=True)
  loader = dc.data.CSVLoader(
      tasks=tox21_tasks, smiles_field="smiles", featurizer=featurizer_func)
  dataset = loader.featurize(dataset_file, shard_size=8192)

  # Initialize transformers
  transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  ]

  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  splitters = {'index': dc.splits.IndexSplitter(),
  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'scaffold': dc.splits.ScaffoldSplitter(),
               'butina': dc.splits.ButinaSplitter()}
      'butina': dc.splits.ButinaSplitter()
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  train, valid, test = splitter.train_valid_test_split(
      dataset, train_dir=train, valid_dir=valid, test_dir=test)

  return tox21_tasks, (train, valid, test), transformers
+47 −0
Original line number Diff line number Diff line
"""
Script that trains graph-conv models on Tox21 dataset.
"""
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
import json

np.random.seed(123)
import tensorflow as tf

tf.set_random_seed(123)
import deepchem as dc
from tox21_datasets import load_tox21
from deepchem.models.tensorgraph.models.graph_models import PetroskiSuchTensorGraph

model_dir = "/tmp/graph_conv"

# Load Tox21 dataset
tox21_tasks, tox21_datasets, transformers = load_tox21(featurizer='AdjMatrix')
train_dataset, valid_dataset, test_dataset = tox21_datasets
print(train_dataset.data_dir)
print(valid_dataset.data_dir)

# Fit models
metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, np.mean, mode="classification")

# Batch size of models
batch_size = 128

model = PetroskiSuchTensorGraph(
    len(tox21_tasks), batch_size=batch_size, mode='classification')

model.fit(train_dataset, nb_epoch=10)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)