Commit 40e80761 authored by ZHENQIN WU's avatar ZHENQIN WU
Browse files

refactor changes

parent 104dfbec
Loading
Loading
Loading
Loading
+4 −16
Original line number Diff line number Diff line
@@ -43,25 +43,13 @@ def load_gdb7(featurizer=None, split='random'):
  # Featurize gdb7 dataset
  print("About to featurize gdb7 dataset.")
  current_dir = os.path.dirname(os.path.realpath(__file__))
  gdb7_tasks = ["u0_atom"]
  if featurizer == None:
  dataset_file = os.path.join(
      current_dir, "./gdb7.sdf")
  gdb7_tasks = ["u0_atom"]
  if featurizer == None:
    featurizer = dc.feat.CoulombMatrix(23)
    loader = dc.data.SDFLoader(tasks=gdb7_tasks, 
        mol_field="mol", smiles_field="smiles", featurizer=featurizer)
  elif featurizer == 'ECFP':
    dataset_file = os.path.join(
      current_dir, "./gdb7_smiles_withoutH.csv")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    loader = dc.data.CSVLoader(
        tasks=gdb7_tasks, smiles_field="smiles", featurizer=featurizer)
  elif featurizer == 'GraphConv':
    dataset_file = os.path.join(
      current_dir, "./gdb7_smiles_withoutH.csv")
    featurizer = dc.feat.ConvMolFeaturizer()
    loader = dc.data.CSVLoader(tasks=gdb7_tasks, smiles_field="smiles", 
                             featurizer=featurizer)
  loader = dc.data.SDFLoader(tasks=gdb7_tasks, smiles_filed="smiles",
                             mol_field="mol", featurizer=featurizer)
  dataset = loader.featurize(dataset_file)
 
  # Initialize transformers 

examples/gdb7/gdb7_graph_conv.py

deleted100644 → 0
+0 −66
Original line number Diff line number Diff line
"""
Script that trains graph-conv models on the gdb7 dataset.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import numpy as np
import tensorflow as tf
import deepchem as dc
from keras import backend as K
from gdb7_datasets import load_gdb7

# Only for debug! Seed NumPy so repeated runs are comparable.
np.random.seed(123)

# Build a dedicated TensorFlow graph and register a session bound to it
# with Keras.
g = tf.Graph()
sess = tf.Session(graph=g)
K.set_session(sess)

with g.as_default():
  # Seed TensorFlow, then load the gdb7 dataset with the 'GraphConv'
  # featurizer and the 'indice' split.
  tf.set_random_seed(123)
  gdb7_tasks, gdb7_datasets, transformers = load_gdb7(featurizer='GraphConv', split='indice')
  train_dataset, valid_dataset, test_dataset = gdb7_datasets

  # Evaluation metrics (regression mode): mean absolute error and
  # Pearson r^2 score.
  metric = [dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression"), 
            dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression")]

  # Do setup required for tf/keras models
  # Number of features on conv-mols
  n_feat = 75
  # Batch size of models
  batch_size = 128
  # Two GraphConv -> BatchNormalization -> GraphPool stages.
  graph_model = dc.nn.SequentialGraph(n_feat)
  graph_model.add(dc.nn.GraphConv(128, activation='relu'))
  graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
  graph_model.add(dc.nn.GraphPool())
  graph_model.add(dc.nn.GraphConv(128, activation='relu'))
  graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
  graph_model.add(dc.nn.GraphPool())
  # Gather Projection
  graph_model.add(dc.nn.Dense(256, activation='relu'))
  graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
  graph_model.add(dc.nn.GraphGather(batch_size, activation="tanh"))
  # Dense post-processing layer
  # NOTE(review): no layer is actually added after the gather step here.

  # NOTE(review): this rebinds `sess`, shadowing the session that was
  # registered with Keras above -- confirm that using a second session is
  # intended.
  with tf.Session() as sess:
    model = dc.models.MultitaskGraphRegressor(
      sess, graph_model, len(gdb7_tasks), batch_size=batch_size,
      learning_rate=1e-3, learning_rate_decay_time=1000,
      optimizer_type="adam", beta1=.9, beta2=.999)

    # Train the model for 20 epochs on the training split.
    model.fit(train_dataset, nb_epoch=20)

    # Score the train and validation splits; the test split is not evaluated.
    print("Evaluating model")
    train_scores = model.evaluate(train_dataset, metric, transformers)
    valid_scores = model.evaluate(valid_dataset, metric, transformers)

    print("Train scores")
    print(train_scores)

    print("Validation scores")
    print(valid_scores)
+0 −6835

File deleted.

Preview size limit exceeded, changes collapsed.

examples/gdb7/gdb7_tf.py

deleted100644 → 0
+0 −42
Original line number Diff line number Diff line
"""
Script that trains multitask models on gdb7 dataset.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import shutil
import numpy as np
import deepchem as dc
from gdb7_datasets import load_gdb7

# Seed NumPy so debug runs are repeatable.
np.random.seed(123)

# Feature-vector width handed to the model constructor below.
n_features = 1024

# Fetch gdb7 with the 'ECFP' featurizer and the 'indice' split.
gdb7_tasks, gdb7_datasets, transformers = load_gdb7(
    split='indice', featurizer='ECFP')
train_dataset, valid_dataset, test_dataset = gdb7_datasets

# Regression metrics used for evaluation: MAE first, then Pearson r^2.
metric = [
    dc.metrics.Metric(score_fn, mode="regression")
    for score_fn in (dc.metrics.mean_absolute_error,
                     dc.metrics.pearson_r2_score)
]

# Multitask regressor: one hidden layer of 1000 units with 25% dropout.
model_options = dict(
    layer_sizes=[1000],
    dropouts=[.25],
    learning_rate=0.001,
    batch_size=50,
    verbosity="high",
)
model = dc.models.TensorflowMultiTaskRegressor(
    len(gdb7_tasks), n_features, **model_options)

# Train on the training split, then persist the fitted model.
model.fit(train_dataset)
model.save()

# Evaluate both splits before printing anything, train split first.
print("Evaluating model")
train_scores, valid_scores = (
    model.evaluate(train_dataset, metric, transformers),
    model.evaluate(valid_dataset, metric, transformers),
)

for heading, scores in (("Train scores", train_scores),
                        ("Validation scores", valid_scores)):
  print(heading)
  print(scores)
+2 −6
Original line number Diff line number Diff line
@@ -9,17 +9,13 @@ import os
import deepchem as dc
import numpy as np
from gdb7_datasets import load_gdb7_from_mat
from gdb7_datasets import load_gdb7

np.random.seed(123)
split = 0
num_atoms = 23

#gdb7_tasks, datasets, transformers = load_gdb7_from_mat(split)
#train_dataset, test_dataset = datasets

gdb7_tasks, datasets, transformers = load_gdb7(split='indice')
train_dataset, test_dataset, _ = datasets
gdb7_tasks, datasets, transformers = load_gdb7_from_mat(split)
train_dataset, test_dataset = datasets

fit_transformers = [dc.trans.CoulombFitTransformer(train_dataset)]
regression_metric = [dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression"),