Commit 04718a20 authored by Joseph Gomes
Browse files

Added in cross-validation splits from reference paper

parent c547856f
Loading
Loading
Loading
Loading
+27 −0
Original line number Diff line number Diff line
@@ -9,8 +9,35 @@ import os
import numpy as np
import shutil
import deepchem as dc
import scipy.io
import csv

def _qm7_numpy_dataset(mat, indices):
  """Build a unit-weight ``NumpyDataset`` from selected samples of ``qm7.mat``.

  Args:
    mat: Mapping returned by ``scipy.io.loadmat``; its 'X' and 'T' entries
      are read.
    indices: Integer index array selecting the samples to keep.

  Returns:
    A ``dc.data.NumpyDataset`` with ``X[indices]`` as features,
    ``T[0, indices]`` as labels, all weights equal to one and ``ids=None``.
  """
  X = mat['X'][indices]
  # 'T' is addressed as a 2-D array; only its first row is used.
  y = mat['T'][0, indices]
  w = np.ones_like(y)
  return dc.data.NumpyDataset(X, y, w, ids=None)

def load_gdb7_from_mat(split=0):
  """Load one train/test cross-validation split of gdb7 from ``qm7.mat``.

  If ``qm7.mat`` is not present in the current working directory it is
  fetched with ``wget``. Each row of the file's 'P' entry is treated as the
  sample indices of one fold: row ``split`` becomes the test set and all
  remaining rows, concatenated in order, become the training set.

  Args:
    split: Zero-based index of the fold (row of 'P') held out for testing.

  Returns:
    A tuple ``(gdb7_tasks, (train_dataset, test_dataset), transformers)``.
    ``transformers`` holds a single ``NormalizationTransformer`` constructed
    with ``transform_y=True`` from the untransformed training set; it has
    already been applied to both returned datasets.

  Raises:
    IOError: If the download fails or ``qm7.mat`` is still missing afterwards.
    IndexError: If ``split`` is not a valid fold index. Negative values are
      rejected because they would silently leave the test fold inside the
      training set.
  """
  filename = 'qm7.mat'
  if not os.path.exists(filename):
    status = os.system('wget http://www.quantum-machine.org/data/qm7.mat')
    # Fail here with a clear message instead of letting loadmat trip over a
    # missing file later.
    if status != 0 or not os.path.exists(filename):
      raise IOError(
          "Could not download %s (wget exit status %s)" % (filename, status))
  dataset = scipy.io.loadmat(filename)

  folds = dataset['P']
  n_folds = folds.shape[0]
  if not 0 <= split < n_folds:
    raise IndexError(
        "split must be in [0, %d), got %r" % (n_folds, split))

  # A list comprehension rather than ``range(...) + range(...)``: range
  # objects cannot be concatenated on Python 3.
  train_folds = [fold for fold in range(n_folds) if fold != split]
  train_dataset = _qm7_numpy_dataset(dataset, folds[train_folds].flatten())
  test_dataset = _qm7_numpy_dataset(dataset, folds[split])

  # The transformer is built from the training set only and then applied to
  # both sets, so no test-set statistics are used.
  transformers = [dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]

  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    test_dataset = transformer.transform(test_dataset)

  gdb7_tasks = ["atomization_energy"]
  return gdb7_tasks, (train_dataset, test_dataset), transformers

def load_gdb7(featurizer=None, split='random'):
  """Load gdb7 datasets."""
  # Featurize gdb7 dataset
+15 −16
Original line number Diff line number Diff line
@@ -8,26 +8,25 @@ from __future__ import unicode_literals
import os
import deepchem as dc
import numpy as np
from gdb7_datasets import load_gdb7
from gdb7_datasets import load_gdb7_from_mat

# NOTE(review): this listing appears to be a diff rendering in which removed
# and added lines are interleaved without +/- markers (duplicate `split`
# assignment, two loader calls, two argument lists for the model below).
# Confirm against the actual file before running it.

# Seed NumPy's global random number generator.
np.random.seed(123)
# `split` is assigned twice in a row; only the second value (0) survives and
# is later passed to load_gdb7_from_mat.
split="random"
split = 0
num_atoms = 23
# Loader call using the CoulombMatrix featurizer; unpacks three datasets.
# Presumably the pre-change version of the loading step -- TODO confirm.
gdb7_tasks, datasets, transformers = load_gdb7(featurizer=dc.feat.CoulombMatrix(num_atoms), split=split)
train_dataset, valid_dataset, test_dataset = datasets
#fit_transformers = [dc.trans.CoulombRandomizationFitTransformer(), dc.trans.NormalizationFitTransformer()]
X = train_dataset.X
fit_transformers = [dc.trans.CoulombFitTransformer(X, num_atoms)]

# Loader call reading the split from the .mat file; unpacks a (train, test)
# pair only, with no validation set.
gdb7_tasks, datasets, transformers = load_gdb7_from_mat(split)
train_dataset, test_dataset = datasets

fit_transformers = [dc.trans.CoulombFitTransformer(train_dataset.X, num_atoms)]
# Two regression metrics: mean absolute error and Pearson r^2.
regression_metric = [dc.metrics.Metric(dc.metrics.mean_absolute_error, 
                                      mode="regression"), dc.metrics.Metric(dc.metrics.pearson_r2_score,
				      mode="regression")]
# NOTE(review): the constructor below is followed by two argument lists: a
# four-layer configuration (2000/800/800/1000, batch_size=512) and a
# three-layer one (400/100/100, batch_size=25, learning_rate given as a
# dict). As written this is not a single valid call; presumably only one of
# the two lists belongs to the current script -- confirm.
model = dc.models.TensorflowMultiTaskFitTransformRegressor(
    n_tasks=len(gdb7_tasks), n_features=23,
    learning_rate=.0002, momentum=.8, batch_size=512,
    weight_init_stddevs=[1/np.sqrt(2000),1/np.sqrt(800),1/np.sqrt(800),1/np.sqrt(1000)],
    bias_init_consts=[0.,0.,0.,0.], layer_sizes=[2000,800,800,1000], 
    dropouts=[0.1,0.1,0.1,0.1], fit_transformers=fit_transformers, n_random_samples=10, seed=123)
    n_tasks=1, n_features=23,
    learning_rate={0: 0.001, 500: 0.0025, 2500: 0.005, 12500: 0.01} , momentum=.8, batch_size=25,
    weight_init_stddevs=[1/np.sqrt(400),1/np.sqrt(100),1/np.sqrt(100)],
    bias_init_consts=[0.,0.,0.], layer_sizes=[400,100,100], 
    dropouts=[0.01,0.01,0.01], fit_transformers=fit_transformers, n_evals=10, seed=123)

# Train the model on the training set for 50 epochs.
model.fit(train_dataset, nb_epoch=50)
@@ -37,6 +36,6 @@ train_scores = model.evaluate(train_dataset, regression_metric, transformers)
# Report the training-set scores; `train_scores` is computed just above this
# excerpt (see the preceding hunk header).
print("Train scores [kcal/mol]")
print(train_scores)

# NOTE(review): both a validation and a test evaluation appear here. The
# .mat loader path unpacks only (train_dataset, test_dataset), so
# `valid_dataset` would come from the load_gdb7 path -- presumably the
# validation lines are the removed side of the diff; confirm.
valid_scores = model.evaluate(valid_dataset, regression_metric, transformers)
print("Validation scores [kcal/mol]")
print(valid_scores)
# Evaluate on the held-out test set with the same metrics and transformers.
test_scores = model.evaluate(test_dataset, regression_metric, transformers)
print("Test scores [kcal/mol]")
print(test_scores)