Commit 1cbf4dfb authored by Bharath Ramsundar
Browse files

Factors update

parent 29efdb3b
Loading
Loading
Loading
Loading
+2 −9
Original line number Diff line number Diff line
@@ -30,10 +30,6 @@ def remove_missing_entries(dataset):

def get_transformers(train_dataset):
  """Return the list of transformers to apply to the datasets.

  No transformers are currently enabled, so a fresh empty list is returned
  and `train_dataset` is accepted but not used.
  """
  # A LogTransformer on X and a NormalizationTransformer on y (constructed
  # with dataset=train_dataset) used to be listed here; both are disabled.
  return []

@@ -56,13 +52,10 @@ def gen_factors(FACTORS_tasks, raw_train_dir, train_dir, valid_dir, test_dir,
  train_dataset = loader.featurize(train_files, shard_size=shard_size)

  print("Featurizing valid datasets")
  valid_dataset = loader.featurize(
      valid_files, shard_size=shard_size)
  valid_dataset = loader.featurize(valid_files, shard_size=shard_size)

  print("Featurizing test datasets")
  print("Creating test dataset")
  test_dataset = loader.featurize(
      test_files, shard_size=shard_size)
  test_dataset = loader.featurize(test_files, shard_size=shard_size)

  print("Remove missing entries from datasets.")
  remove_missing_entries(train_dataset)
+2 −0
Original line number Diff line number Diff line
@@ -75,6 +75,8 @@ for trial in range(num_trials):
  print("Mean Test score")
  print(test_score)

print("####################################################################")

for trial in range(num_trials):
  (train_score, train_task_scores, valid_score, valid_task_scores,
   test_score, test_task_scores) = all_results[trial]
+3 −1
Original line number Diff line number Diff line
@@ -18,7 +18,7 @@ np.random.seed(123)

###Load data###
shard_size = 2000
num_trials = 1
num_trials = 5
print("About to load FACTORS data.")
FACTORS_tasks, datasets, transformers = load_factors(shard_size=shard_size)
train_dataset, valid_dataset, test_dataset = datasets
@@ -76,6 +76,8 @@ for trial in range(num_trials):
  print("Mean Test score")
  print(test_score)

print("####################################################################")

for trial in range(num_trials):
  (train_score, train_task_scores, valid_score, valid_task_scores,
   test_score, test_task_scores) = all_results[trial]
+68 −0
Original line number Diff line number Diff line
"""
Script that trains Tensorflow Robust Multitask models on FACTORS datasets.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import tempfile
import shutil
import deepchem as dc
from MERCK_datasets import load_factors

# Seed numpy's global random number generator.
np.random.seed(123)

###Load data###
shard_size = 2000
num_shards_per_batch = 4
# NOTE(review): load_factors is imported from MERCK_datasets at the top of
# this file -- confirm that is the intended module for the FACTORS loader.
print("About to load FACTORS data.")
FACTORS_tasks, datasets, transformers = load_factors(
    shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)
train_dataset, valid_dataset, test_dataset = datasets

print("Number of compounds in train set")
print(len(train_dataset))
print("Number of compounds in validation set")
print(len(valid_dataset))
print("Number of compounds in test set")
print(len(test_dataset))

###Create model###
n_layers = 3
n_bypass_layers = 3
nb_epoch = 125
# First entry of the dataset's data shape is passed as the feature count.
n_features = train_dataset.get_data_shape()[0]
model = dc.models.RobustMultitaskRegressor(
    len(FACTORS_tasks), n_features,
    layer_sizes=[1000]*n_layers, bypass_layer_sizes=[100]*n_bypass_layers,
    dropouts=[.25]*n_layers, bypass_dropouts=[.25]*n_bypass_layers,
    weight_init_stddevs=[.02]*n_layers, bias_init_consts=[1.]*n_layers,
    bypass_weight_init_stddevs=[.02]*n_bypass_layers,
    bypass_bias_init_consts=[1.]*n_bypass_layers,
    learning_rate=.0003, penalty=.0001, penalty_type="l2",
    optimizer="adam", batch_size=100, seed=123, verbosity="high")

# Regression metric: Pearson R^2 per task, averaged across tasks with np.mean.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean,
                           mode="regression")

###Fit model###
print("Fitting Model")
model.fit(train_dataset, nb_epoch=nb_epoch)

###Evaluate model###
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
# Only use the test set for final evaluation.
test_scores = model.evaluate(test_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)

print("Test scores")
print(test_scores)
+60 −23
Original line number Diff line number Diff line
@@ -10,18 +10,17 @@ import numpy as np
import tempfile
import shutil
import deepchem as dc
from MERCK_datasets import load_factors
from FACTORS_datasets import load_factors

# Set numpy seed
np.random.seed(123)

###Load data###
shard_size = 2000
num_shards_per_batch = 4
num_trials = 5

print("About to load FACTORS data.")
FACTORS_tasks, datasets, transformers = load_factors(
    shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)
FACTORS_tasks, datasets, transformers = load_factors(shard_size=shard_size)
train_dataset, valid_dataset, test_dataset = datasets

print("Number of compounds in train set")
@@ -31,11 +30,12 @@ print(len(valid_dataset))
print("Number of compounds in test set")
print(len(test_dataset))

metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean)

###Create model###
n_layers = 3
#nb_epoch = 50
#nb_epoch = 125
nb_epoch = 10
#nb_epoch = 1
nb_epoch = 125
n_features = train_dataset.get_data_shape()[0]
def task_model_builder(m_dir):
  return dc.models.TensorflowMultiTaskRegressor(
@@ -43,25 +43,62 @@ def task_model_builder(m_dir):
      layer_sizes=[1000]*n_layers, dropouts=[.25]*n_layers,
      weight_init_stddevs=[.02]*n_layers, bias_init_consts=[1.]*n_layers,
      learning_rate=.0003, penalty=.0001, penalty_type="l2", optimizer="adam",
      batch_size=100, seed=123, verbosity="high")
model = dc.models.SingletaskToMultitask(FACTORS_tasks, task_model_builder)
      batch_size=100, seed=123)

###Evaluate models###
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean)
all_results = []
for trial in range(num_trials):
  print("Starting trial %d" % trial)
  model = dc.models.SingletaskToMultitask(FACTORS_tasks, task_model_builder)

  print("Fitting Model")
  model.fit(train_dataset, nb_epoch=nb_epoch)

train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
#Only use for final evaluation
test_scores = model.evaluate(test_dataset, [metric], transformers)

print("Train scores")
print(train_scores)
  print("Evaluating models")
  train_score, train_task_scores = model.evaluate(
      train_dataset, [metric], transformers, per_task_metrics=True)
  valid_score, valid_task_scores = model.evaluate(
      valid_dataset, [metric], transformers, per_task_metrics=True)
  test_score, test_task_scores = model.evaluate(
      test_dataset, [metric], transformers, per_task_metrics=True)

  all_results.append((train_score, train_task_scores,
                      valid_score, valid_task_scores,
                      test_score, test_task_scores))

  print("----------------------------------------------------------------")
  print("Scores for trial %d" % trial)
  print("----------------------------------------------------------------")
  print("train_task_scores")
  print(train_task_scores)
  print("Mean Train score")
  print(train_score)
  print("valid_task_scores")
  print(valid_task_scores)
  print("Mean Validation score")
  print(valid_score)
  print("test_task_scores")
  print(test_task_scores)
  print("Mean Test score")
  print(test_score)

print("Validation scores")
print(valid_scores)
print("####################################################################")

print("Test scores")
print(test_scores)
for trial in range(num_trials):
  (train_score, train_task_scores, valid_score, valid_task_scores,
   test_score, test_task_scores) = all_results[trial]
  print("----------------------------------------------------------------")
  print("Scores for trial %d" % trial)
  print("----------------------------------------------------------------")
  print("train_task_scores")
  print(train_task_scores)
  print("Mean Train score")
  print(train_score)
  print("valid_task_scores")
  print(valid_task_scores)
  print("Mean Validation score")
  print(valid_score)
  print("test_task_scores")
  print(test_task_scores)
  print("Mean Test score")
  print(test_score)