Commit bb95f8e9 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Initial delaney solubility models.

parent 5bd27fcd
Loading
Loading
Loading
Loading
+42 −0
Original line number Diff line number Diff line
"""
Delaney dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import shutil
import deepchem as dc

def load_delaney(featurizer='ECFP'):
  """Load delaney datasets."""
  # Featurize Delaney dataset
  print("About to featurize Delaney dataset.")
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(
      current_dir, "../../datasets/delaney-processed.csv")
  delaney_tasks = ['measured log solubility in mols per litre']
  if featurizer == 'ECFP':
    featurizer_func = dc.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer_func = dc.feat.ConvMolFeaturizer()
  loader = dc.load.DataLoader(
      tasks=delaney_tasks, smiles_field="smiles",
      featurizer=featurizer_func, verbosity = 'high')
  dataset = loader.featurize(
      dataset_file, shard_size=8192)

  # Initialize transformers 
  transformers = [
      dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)]

  print("About to transform data")
  for transformer in transformers:
      dataset = transformer.transform(dataset)

  splitter = dc.splits.IndexSplitter()
  train, valid, test = splitter.train_valid_test_split(dataset,
      compute_feature_statistics=False)
  return delaney_tasks, (train, valid, test), transformers
+41 −0
Original line number Diff line number Diff line
"""
Script that trains multitask models on Delaney dataset.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import shutil
import numpy as np
import deepchem as dc
from delaney_dataset import load_delaney

# Only for debug!
np.random.seed(123)

# Load Delaney dataset
n_features = 1024
delaney_tasks, delaney_datasets, transformers = load_delaney()
train_dataset, valid_dataset, test_dataset = delaney_datasets

# Fit models
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

model = dc.models.TensorflowMultiTaskRegressor(
    len(delaney_tasks), n_features, layer_sizes=[1000], dropouts=[.25],
    learning_rate=0.001, batch_size=50, verbosity="high")

# Fit trained model
model.fit(train_dataset)
model.save()

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)