Commit ca7335fd authored by miaecle's avatar miaecle
Browse files

Merge remote-tracking branch 'remotes/mine/molnet'

parents a59f9b3e 517bf7b7
Loading
Loading
Loading
Loading
+2 −3
Original line number Diff line number Diff line
@@ -333,9 +333,8 @@ class TensorGraph(Model):
        feed_dict[self._training_placeholder] = 0.0
        feed_results = self.session.run(outputs, feed_dict=feed_dict)
        if len(feed_results) > 1:
          if len(transformers):
            raise ValueError("Does not support transformations "
                             "for multiple outputs.")
          result = undo_transforms(np.stack(feed_results, 1), transformers)
          feed_results = [result[:, i] for i in range(result.shape[1])]
        elif len(feed_results) == 1:
          result = undo_transforms(feed_results[0], transformers)
          feed_results = [result]

delaney_dag_valid.py

0 → 100644
+58 −0
Original line number Diff line number Diff line
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import deepchem
import numpy as np
import tensorflow as tf

# Seed NumPy up front so that the NumPy-driven parts of the run (e.g. the
# random split) are repeatable.
seed = 123
np.random.seed(seed)

# Delaney dataset with graph-convolution features and a random split.
tasks, datasets, transformers = deepchem.molnet.load_delaney(
    featurizer='GraphConv', split='random', reload=False)
train_dataset, valid_dataset, test_dataset = datasets
metric = [deepchem.metrics.Metric(deepchem.metrics.rms_score, np.mean)]

# Largest molecule (in atoms) over all three splits. The same bound is handed
# to both the DAG transformer and the DAG model below so the two always agree.
max_atoms_train = max([mol.get_num_atoms() for mol in train_dataset.X])
max_atoms_valid = max([mol.get_num_atoms() for mol in valid_dataset.X])
max_atoms_test = max([mol.get_num_atoms() for mol in test_dataset.X])
max_atoms = max([max_atoms_train, max_atoms_valid, max_atoms_test])

# Reshard each split, then apply the DAG transformation to it.
reshard_size = 512
transformer = deepchem.trans.DAGTransformer(max_atoms=max_atoms)
train_dataset.reshard(reshard_size)
train_dataset = transformer.transform(train_dataset)
valid_dataset.reshard(reshard_size)
valid_dataset = transformer.transform(valid_dataset)
test_dataset.reshard(reshard_size)
test_dataset = transformer.transform(test_dataset)

# Hyper-parameters for the DAG model.
batch_size = 128
nb_epoch = 1000
learning_rate = 0.0005
n_graph_feat = 23

tf.set_random_seed(seed)
model = deepchem.models.DAGTensorGraph(
    len(tasks),
    # Use the bound computed from the data instead of a hard-coded 55 so the
    # model cannot drift out of sync with the DAGTransformer above.
    max_atoms=max_atoms,
    n_atom_feat=75,
    n_graph_feat=n_graph_feat,
    mode='regression',
    batch_size=batch_size,
    # Previously misspelled as `leanring_rate`, so the value configured above
    # was not passed under the intended keyword.
    learning_rate=learning_rate,
    use_queue=False)

# Train, then score every split; `transformers` are the ones returned by the
# loader and are passed to `evaluate` together with the metric.
model.fit(train_dataset, nb_epoch=nb_epoch)
train_scores = model.evaluate(train_dataset, metric, transformers)
valid_scores = model.evaluate(valid_dataset, metric, transformers)
test_scores = model.evaluate(test_dataset, metric, transformers)

# NOTE(review): the numbers below were recorded with the earlier version of
# this script (misspelled learning-rate keyword, max_atoms=55); re-run and
# refresh them after confirming the corrected configuration.
""" Expected Results:
  train_scores: {'mean-rms_score': 0.029829638487211169}
    
  valid_scores: {'mean-rms_score': 0.75142478279661051}
    
  test_scores: {'mean-rms_score': 0.53192168238754678}

"""
 No newline at end of file
+37 −0
Original line number Diff line number Diff line
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import deepchem
import numpy as np
import tensorflow as tf

# Seed NumPy up front so that the NumPy-driven parts of the run (e.g. the
# random split) are repeatable.
seed = 123
np.random.seed(seed)

# Delaney dataset with graph-convolution features and a random split.
tasks, datasets, transformers = deepchem.molnet.load_delaney(
    featurizer='GraphConv', split='random', reload=False)
train_dataset, valid_dataset, test_dataset = datasets
metric = [deepchem.metrics.Metric(deepchem.metrics.rms_score, np.mean)]

# Hyper-parameters for the graph-convolution model.
batch_size = 150
nb_epoch = 1000
learning_rate = 0.0008

tf.set_random_seed(seed)
model = deepchem.models.GraphConvTensorGraph(
    len(tasks),
    mode='regression',
    batch_size=batch_size,
    # Previously misspelled as `leanring_rate`, so the value configured above
    # was not passed under the intended keyword.
    learning_rate=learning_rate)

# Train, then score every split; `transformers` are the ones returned by the
# loader and are passed to `evaluate` together with the metric.
model.fit(train_dataset, nb_epoch=nb_epoch)
train_scores = model.evaluate(train_dataset, metric, transformers)
valid_scores = model.evaluate(valid_dataset, metric, transformers)
test_scores = model.evaluate(test_dataset, metric, transformers)

# NOTE(review): the numbers below were recorded with the earlier version of
# this script (misspelled learning-rate keyword); re-run and refresh them
# after confirming the corrected configuration.
""" Expected Results:
  train_scores: {'mean-rms_score': 0.058672648022210311}
    
  valid_scores: {'mean-rms_score': 0.3635136142334261}
    
  test_scores: {'mean-rms_score': 0.35664025829369983}

"""
 No newline at end of file
+143 −0
Original line number Diff line number Diff line
"""
Created on Sat Oct 14 16:59:49 2017

@author: zqwu

This script evaluates how model performance changes with the size of
the training set (expressed as a fraction of the whole dataset).

The default fractions evaluated are 0.1, 0.2, ..., 0.9.
For each fraction, the whole dataset is split into a train set and a
valid set with the corresponding fractions (the test set is not used).
Models are trained on the train set and evaluated on the valid set.
Command line options are the same as for `benchmark.py`.

All results, together with their train set fractions, are appended to
'./results_frac_train_curve.csv'.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import deepchem as dc
import argparse
import pickle
import csv
from deepchem.molnet.run_benchmark_models import benchmark_classification, benchmark_regression
from deepchem.molnet.run_benchmark import load_dataset
from deepchem.molnet.check_availability import CheckFeaturizer, CheckSplit
from deepchem.molnet.preset_hyper_parameters import hps


# Training-set fractions to evaluate; the remainder of the data is used as
# the validation set for each fraction.
frac_trains = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Datasets for which each benchmark mode is used.
classification_datasets = [
    'bace_c', 'bbbp', 'clintox', 'hiv', 'muv', 'pcba', 'sider', 'tox21',
    'toxcast'
]
regression_datasets = [
    'bace_r', 'chembl', 'clearance', 'delaney', 'hopv', 'kaggle', 'lipo',
    'nci', 'pdbbind', 'ppb', 'qm7', 'qm7b', 'qm8', 'qm9', 'sampl'
]

parser = argparse.ArgumentParser(
    description='Deepchem benchmark: ' +
    'giving performances of different learning models on datasets')
parser.add_argument(
    '-s',
    action='append',
    dest='splitter_args',
    default=[],
    help='Choice of splitting function: index, random, scaffold, stratified')
parser.add_argument(
    '-m',
    action='append',
    dest='model_args',
    default=[],
    help='Choice of model: tf, tf_robust, logreg, rf, irv, graphconv, xgb,' +
    ' dag, weave, tf_regression, tf_regression_ft, rf_regression, ' +
    'graphconvreg, xgb_regression, dtnn, dag_regression, weave_regression')
parser.add_argument(
    '-d',
    action='append',
    dest='dataset_args',
    default=[],
    help='Choice of dataset: bace_c, bace_r, bbbp, chembl, clearance, ' +
    'clintox, delaney, hiv, hopv, kaggle, lipo, muv, nci, pcba, ' +
    'pdbbind, ppb, qm7, qm7b, qm8, qm9, sampl, sider, tox21, toxcast')
parser.add_argument(
    '--seed',
    action='append',
    dest='seed_args',
    default=[],
    help='Choice of random seed')

args = parser.parse_args()
# Datasets, splits and models requested on the command line.
splitters = args.splitter_args
models = args.model_args
datasets = args.dataset_args

# Only the first --seed value is used; fall back to 123 when none is given.
if len(args.seed_args) > 0:
  seed = int(args.seed_args[0])
else:
  seed = 123

# Map from split name to splitter object. Kept under its own name so it does
# not overwrite `splitters` (the list of requested split names) while that
# list is still driving the outer loop.
splitter_map = {
    'index': dc.splits.IndexSplitter(),
    'random': dc.splits.RandomSplitter(),
    'scaffold': dc.splits.ScaffoldSplitter(),
    'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=0)
}

out_path = '.'
for dataset in datasets:
  for split in splitters:
    for model in models:

      hyper_parameters = None
      # Uncomment the two lines below if hyper_parameters are provided
      # (pickle files should be opened in binary mode).
      #with open(os.path.join(out_path, dataset + model + '.pkl'), 'rb') as f:
      #  hyper_parameters = pickle.load(f)

      if dataset in classification_datasets:
        mode = 'classification'
        metric = [dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)]
      elif dataset in regression_datasets:
        mode = 'regression'
        metric = [dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)]
      else:
        # Fail loudly instead of reusing `mode`/`metric` left over from a
        # previous iteration (or hitting a NameError on the first one).
        raise ValueError('Dataset not supported: %s' % dataset)

      pair = (dataset, model)
      if pair not in CheckFeaturizer:
        # No featurizer is registered for this combination; skip it rather
        # than reuse the featurizer of a previous (dataset, model) pair.
        print('Skipping unsupported dataset/model pair: %s, %s' %
              (dataset, model))
        continue
      featurizer = CheckFeaturizer[pair][0]
      n_features = CheckFeaturizer[pair][1]

      # Load with an index split and merge the pieces back together so the
      # whole dataset can be re-split at each training fraction below.
      tasks, all_dataset, transformers = load_dataset(
          dataset, featurizer, split='index')
      all_dataset = dc.data.DiskDataset.merge(all_dataset)
      for frac_train in frac_trains:
        splitter = splitter_map[split]
        # Re-seed before every split so each fraction is split reproducibly.
        np.random.seed(seed)
        train, valid, test = splitter.train_valid_test_split(
            all_dataset,
            frac_train=frac_train,
            frac_valid=1 - frac_train,
            frac_test=0.)
        # The test split is empty (frac_test=0); pass the valid set in its
        # place. The benchmark functions are called with test=False.
        test = valid
        if mode == 'classification':
          train_score, valid_score, test_score = benchmark_classification(
              train,
              valid,
              test,
              tasks,
              transformers,
              n_features,
              metric,
              model,
              test=False,
              hyper_parameters=hyper_parameters,
              seed=seed)
        elif mode == 'regression':
          train_score, valid_score, test_score = benchmark_regression(
              train,
              valid,
              test,
              tasks,
              transformers,
              n_features,
              metric,
              model,
              test=False,
              hyper_parameters=hyper_parameters,
              seed=seed)
        # Append one CSV row per metric entry reported for the model.
        with open(os.path.join(out_path, 'results_frac_train_curve.csv'),
                  'a') as f:
          writer = csv.writer(f)
          model_name = list(train_score.keys())[0]
          for i in train_score[model_name]:
            output_line = [
                dataset,
                str(split), mode, model_name, i, 'train',
                train_score[model_name][i], 'valid',
                valid_score[model_name][i]
            ]
            output_line.extend(['frac_train', frac_train])
            writer.writerow(output_line)

qm9_dtnn_test.py

0 → 100644
+89 −0
Original line number Diff line number Diff line
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import deepchem
import numpy as np
import tensorflow as tf
import tempfile

# Locate the QM9 SDF file inside deepchem's data directory.
data_dir = deepchem.utils.get_data_dir()
dataset_file = os.path.join(data_dir, "gdb9.sdf")

# Single regression target, featurized with a Coulomb matrix.
qm9_tasks = ["u0_atom"]
featurizer = deepchem.feat.CoulombMatrix(29)

loader = deepchem.data.SDFLoader(
    tasks=qm9_tasks,
    smiles_field="smiles",
    mol_field="mol",
    featurizer=featurizer)

dataset = loader.featurize(dataset_file)
splitter = deepchem.splits.RandomSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset)

# The y-normalization is built from the training split only and then applied
# to all three splits.
transformers = [
    deepchem.trans.NormalizationTransformer(
        transform_y=True, dataset=train_dataset)
]
for transformer in transformers:
  train_dataset = transformer.transform(train_dataset)
  valid_dataset = transformer.transform(valid_dataset)
  test_dataset = transformer.transform(test_dataset)

metric = [
    deepchem.metrics.Metric(deepchem.metrics.mean_absolute_error, np.mean)
]

# Hyper-parameters for the DTNN model.
batch_size = 49
nb_epoch = 100
learning_rate = 0.0003
n_embedding = 42
n_distance = 173

seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)
# Every model instance below shares this directory so that `restore()` can
# pick up what the previous stage saved.
model_dir = tempfile.mkdtemp()


def _build_model(rate):
  """Return a DTNNTensorGraph configured as above with learning rate `rate`."""
  return deepchem.models.DTNNTensorGraph(
      len(qm9_tasks),
      n_embedding=n_embedding,
      n_hidden=60,
      n_distance=n_distance,
      distance_min=-1.,
      distance_max=18.,
      output_activation=False,
      batch_size=batch_size,
      learning_rate=rate,
      use_queue=False,
      mode="regression",
      model_dir=model_dir)


# Stage 1: main training run at the base learning rate.
model = _build_model(learning_rate)
model.fit(train_dataset, nb_epoch=nb_epoch)

# Stage 2: fine-tune with progressively smaller learning rates. Each stage
# rebuilds the model with the new rate and restores the saved state. The loop
# variable is now actually used; previously every stage ran at
# learning_rate/5.
for rate in [learning_rate / 5, learning_rate / 20, learning_rate / 100]:
  model = _build_model(rate)
  model.restore()
  model.fit(train_dataset, nb_epoch=10)

train_scores = model.evaluate(train_dataset, metric, transformers)
valid_scores = model.evaluate(valid_dataset, metric, transformers)
test_scores = model.evaluate(test_dataset, metric, transformers)

# NOTE(review): this extra fit runs after the scores above were computed and
# its result is not evaluated; it looks like a leftover. Confirm and remove.
model.fit(train_dataset, nb_epoch=10)
# NOTE(review): the numbers below were recorded before the fine-tuning loop
# honoured its per-stage learning rate; re-run and refresh them.
'''
computed_metrics: [0.95282979862675088]
computed_metrics: [1.1501283330568968]
computed_metrics: [1.2601717317672092]
'''
 No newline at end of file
Loading