Commit 1a0076e6 authored by miaecle's avatar miaecle
Browse files

adding SAMPL

parent 294805bb
Loading
Loading
Loading
Loading
+8 −10
Original line number Diff line number Diff line
@@ -351,20 +351,18 @@ class IndiceSplitter(Splitter):
    """
    num_datapoints = len(dataset)
    indices = np.arange(num_datapoints).tolist()
    train_indices = []
    if self.valid_indices is None:
      self.valid_indices = []
    else:
      for indice in indices:
        if indice in self.valid_indices:
          indices.remove(indice)
    if self.test_indices is None:
      self.test_indices = []
    else:
    valid_test = self.valid_indices
    valid_test.extend(self.test_indices)
    for indice in indices:
        if indice in self.valid_indices:
          indices.remove(indice)
      if not indice in valid_test:
        train_indices.append(indice)
    
    return (indices, self.valid_indices, self.test_indices)
    return (train_indices, self.valid_indices, self.test_indices)


class ScaffoldSplitter(Splitter):
+644 −0

File added.

Preview size limit exceeded, changes collapsed.

+43 −0
Original line number Diff line number Diff line
"""
SAMPL dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import shutil
import deepchem as dc

def load_SAMPL(featurizer='ECFP', split='index'):
  """Featurize, normalize and split the SAMPL dataset.

  Reads ``SAMPL.csv`` from the directory that contains this module,
  featurizes its ``smiles`` field for the ``expt`` and ``calc`` tasks,
  applies a ``NormalizationTransformer`` (``transform_y=True``) and then
  splits the transformed dataset into train/valid/test.

  Parameters
  ----------
  featurizer:
    ``'ECFP'`` selects ``dc.feat.CircularFingerprint(size=1024)`` and
    ``'GraphConv'`` selects ``dc.feat.ConvMolFeaturizer()``. Any other
    value is handed to the CSV loader unchanged.
  split:
    Key choosing the splitter: ``'index'``, ``'random'`` or ``'scaffold'``.

  Returns
  -------
  A 3-tuple ``(SAMPL_tasks, (train, valid, test), transformers)``.

  Raises
  ------
  KeyError
    If ``split`` is not one of the supported keys. The lookup happens
    after featurization and transformation have already run.
  """
  print("About to featurize SAMPL dataset.")
  module_dir = os.path.dirname(os.path.realpath(__file__))
  csv_path = os.path.join(module_dir, "./SAMPL.csv")
  SAMPL_tasks = ['expt', 'calc']

  # Map the two recognised string shortcuts onto featurizer objects;
  # anything else is passed through as given.
  if featurizer == 'ECFP':
    featurizer = dc.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = dc.feat.ConvMolFeaturizer()

  dataset = dc.data.CSVLoader(
      tasks=SAMPL_tasks, smiles_field="smiles",
      featurizer=featurizer).featurize(csv_path, shard_size=8192)

  # The normalizer is built from the full featurized dataset, before any
  # splitting takes place.
  normalizer = dc.trans.NormalizationTransformer(
      transform_y=True, dataset=dataset)
  transformers = [normalizer]

  print("About to transform data")
  for step in transformers:
    dataset = step.transform(dataset)

  # All three splitters are instantiated; `split` picks the one to use.
  splitter = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'scaffold': dc.splits.ScaffoldSplitter(),
  }[split]
  train_set, valid_set, test_set = splitter.train_valid_test_split(dataset)
  return SAMPL_tasks, (train_set, valid_set, test_set), transformers
+68 −0
Original line number Diff line number Diff line
"""
Script that trains graph-conv models on SAMPL dataset.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import numpy as np
import tensorflow as tf
import deepchem as dc
from keras import backend as K
from SAMPL_datasets import load_SAMPL

# Only for debug! Fixes the NumPy seed so repeated runs draw the same
# random numbers.
np.random.seed(123)

# Build a dedicated TensorFlow graph, open a session on it, and register
# that session with Keras.
g = tf.Graph()
sess = tf.Session(graph=g)
K.set_session(sess)

with g.as_default():
  # Load SAMPL dataset (featurized for graph convolutions); the TF seed is
  # set first, inside this graph's context.
  tf.set_random_seed(123)
  SAMPL_tasks, SAMPL_datasets, transformers = load_SAMPL(featurizer='GraphConv')
  train_dataset, valid_dataset, test_dataset = SAMPL_datasets

  # Evaluation metric: pearson_r2_score, with np.mean as the second argument
  # (presumably the across-task aggregator -- confirm against dc.metrics.Metric).
  metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

  # Do setup required for tf/keras models
  # Number of features on conv-mols
  n_feat = 75
  # Batch size of models
  batch_size = 128
  # Two (GraphConv -> BatchNormalization -> GraphPool) stages, followed by a
  # Dense -> BatchNormalization -> GraphGather stage.
  graph_model = dc.nn.SequentialGraph(n_feat)
  graph_model.add(dc.nn.GraphConv(128, activation='relu'))
  graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
  graph_model.add(dc.nn.GraphPool())
  graph_model.add(dc.nn.GraphConv(128, activation='relu'))
  graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
  graph_model.add(dc.nn.GraphPool())
  # Gather Projection
  graph_model.add(dc.nn.Dense(256, activation='relu'))
  graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
  graph_model.add(dc.nn.GraphGather(batch_size, activation="tanh"))
  # Dense post-processing layer (no such layer is added in this script)

  # NOTE(review): this rebinds `sess`, shadowing the session registered with
  # Keras above; the regressor below is given this new session. Confirm that
  # using two sessions is intended.
  with tf.Session() as sess:
    # One output per SAMPL task.
    model = dc.models.MultitaskGraphRegressor(
      sess, graph_model, len(SAMPL_tasks), batch_size=batch_size,
      learning_rate=1e-3, learning_rate_decay_time=1000,
      optimizer_type="adam", beta1=.9, beta2=.999)

    # Train the model for 20 epochs on the training split
    model.fit(train_dataset, nb_epoch=20)

    # Score the train and validation splits, passing the dataset transformers
    # to evaluate().
    print("Evaluating model")
    train_scores = model.evaluate(train_dataset, [metric], transformers)
    valid_scores = model.evaluate(valid_dataset, [metric], transformers)

    print("Train scores")
    print(train_scores)

    print("Validation scores")
    print(valid_scores)
    # Predictions for all three splits; none of these results is used again
    # in the visible part of the script.
    pred = model.predict(train_dataset, transformers)
    pred2 = model.predict(valid_dataset, transformers)
    pred3 = model.predict(test_dataset, transformers)
+41 −0
Original line number Diff line number Diff line
"""
Script that trains multitask models on SAMPL dataset.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import shutil
import numpy as np
import deepchem as dc
from SAMPL_datasets import load_SAMPL

# Only for debug! Fixes the NumPy seed so repeated runs draw the same
# random numbers.
np.random.seed(123)

# Load SAMPL dataset using load_SAMPL's default featurizer and split.
# n_features is the input width handed to the model below (presumably it must
# match the feature size produced by the default featurizer -- confirm).
n_features = 1024
SAMPL_tasks, SAMPL_datasets, transformers = load_SAMPL()
train_dataset, valid_dataset, test_dataset = SAMPL_datasets

# Evaluation metric: pearson_r2_score, with np.mean as the second argument
# (presumably the across-task aggregator -- confirm against dc.metrics.Metric).
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

# Multitask regressor with one output per SAMPL task and a single hidden
# layer of size 1000 with dropout 0.25.
model = dc.models.TensorflowMultiTaskRegressor(
    len(SAMPL_tasks), n_features, layer_sizes=[1000], dropouts=[.25],
    learning_rate=0.001, batch_size=50, verbosity="high")

# Train on the training split, then call save() on the fitted model
model.fit(train_dataset)
model.save()

# Score the train and validation splits; test_dataset is not evaluated in
# this script.
print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)
Loading