Commit 03c26773 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #442 from joegomes/qm8

QM8 Dataset + Example
parents b38bcdcc 6798f1ba
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
echo "Pulling qm8 dataset from deepchem"
wget http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/gdb8.tar.gz
echo "Extracting qm8 structures"
tar -zxvf gdb8.tar.gz
+52 −0
Original line number Diff line number Diff line
"""
qm8 dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import shutil
import deepchem as dc


def load_qm8(featurizer=None, split='random'):
  """Load qm8 datasets."""
  # Featurize qm8 dataset
  print("About to featurize qm8 dataset.")
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(current_dir, "./qm8.sdf")
  if not os.path.exists(dataset_file):
    os.system('sh ' + current_dir + '/get_qm8.sh')

  qm8_tasks = [
      "E1-CC2", "E2-CC2", "f1-CC2", "f2-CC2", "E1-PBE0", "E2-PBE0", "f1-PBE0",
      "f2-PBE0", "E1-PBE0", "E2-PBE0", "f1-PBE0", "f2-PBE0", "E1-CAM", "E2-CAM",
      "f1-CAM", "f2-CAM"
  ]
  if featurizer is None:
    featurizer = dc.feat.CoulombMatrix(26)
  loader = dc.data.SDFLoader(
      tasks=qm8_tasks,
      smiles_field="smiles",
      mol_field="mol",
      featurizer=featurizer)
  dataset = loader.featurize(dataset_file)
  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=0)
  }
  splitter = splitters[split]
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset)
  transformers = [
      dc.trans.NormalizationTransformer(
          transform_y=True, dataset=train_dataset)
  ]
  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)
  return qm8_tasks, (train_dataset, valid_dataset, test_dataset), transformers
+49 −0
Original line number Diff line number Diff line
"""
Script that trains Tensorflow multitask models on QM8 dataset.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import deepchem as dc
import numpy as np
from qm8_datasets import load_qm8

np.random.seed(123)
qm8_tasks, datasets, transformers = load_qm8()
train_dataset, valid_dataset, test_dataset = datasets
fit_transformers = [dc.trans.CoulombFitTransformer(train_dataset)]
regression_metric = [
    dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression"),
    dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression")
]
model = dc.models.TensorflowMultiTaskFitTransformRegressor(
    n_tasks=len(qm8_tasks),
    n_features=[26, 26],
    learning_rate=0.001,
    momentum=.8,
    batch_size=32,
    weight_init_stddevs=[1 / np.sqrt(400), 1 / np.sqrt(100), 1 / np.sqrt(100)],
    bias_init_consts=[0., 0., 0.],
    layer_sizes=[400, 100, 100],
    dropouts=[0.01, 0.01, 0.01],
    fit_transformers=fit_transformers,
    n_evals=10,
    seed=123)

# Fit trained model
model.fit(train_dataset, nb_epoch=50)
model.save()

train_scores = model.evaluate(train_dataset, regression_metric, transformers)
print("Train scores [kcal/mol]")
print(train_scores)

valid_scores = model.evaluate(valid_dataset, regression_metric, transformers)
print("Valid scores [kcal/mol]")
print(valid_scores)

test_scores = model.evaluate(test_dataset, regression_metric, transformers)
print("Test scores [kcal/mol]")
print(test_scores)