Commit c011725e authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Factors files

parent 23e59acb
Loading
Loading
Loading
Loading
+43 −0
Original line number Diff line number Diff line
"""
Script that computes correlations of FACTORS tasks. 
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import tempfile
import shutil
import deepchem as dc
import pandas as pd
from FACTORS_datasets import load_factors

###Load data###
# Seed numpy before loading so the run is repeatable.
np.random.seed(123)
shard_size = 2000
print("About to load FACTORS data.")
FACTORS_tasks, datasets, transformers = load_factors(shard_size=shard_size)
train_dataset, valid_dataset, test_dataset = datasets

# Correlations are computed on the training labels only; each column of
# y_train is indexed below as one task.
y_train = train_dataset.y
n_tasks = y_train.shape[1]

###Compute pairwise task correlations###
all_results = []
for task in range(n_tasks):
  y_task = y_train[:, task]
  # One row of the correlation table. The diagonal entry is pinned to 1.
  # instead of being computed; every other entry is the Pearson R^2 score
  # between this task's labels and the other task's labels.
  task_results = [
      1. if other_task == task else dc.metrics.pearson_r2_score(
          y_task, y_train[:, other_task])
      for other_task in range(n_tasks)
  ]
  print("Task %d" % task)
  print(task_results)
  all_results.append(task_results)

###Save results###
print("Writing results to factors_corr.csv")
df = pd.DataFrame(all_results)
df.to_csv("factors_corr.csv")
+63 −30
Original line number Diff line number Diff line
@@ -11,17 +11,16 @@ import tempfile
import shutil
import numpy as np
import deepchem as dc
from MERCK_datasets import load_factors
from FACTORS_datasets import load_factors

# Set numpy seed
np.random.seed(123)

###Load data###
shard_size = 2000
num_shards_per_batch = 4
print("About to load MERCK data.")
FACTORS_tasks, datasets, transformers = load_factors(
    shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)
num_trials = 5
print("About to load FACTORS data.")
FACTORS_tasks, datasets, transformers = load_factors(shard_size=shard_size)
train_dataset, valid_dataset, test_dataset = datasets

print("Number of compounds in train set")
@@ -31,6 +30,8 @@ print(len(valid_dataset))
print("Number of compounds in test set")
print(len(test_dataset))

all_results = []
for trial in range(num_trials):
  ###Create model###
  n_layers = 3
  nb_epoch = 50
@@ -40,26 +41,58 @@ model = dc.models.ProgressiveMultitaskRegressor(
      alpha_init_stddevs=[.02]*n_layers, weight_init_stddevs=[.02]*n_layers,
      bias_init_consts=[1.]*n_layers, learning_rate=.0003,
      penalty=.0001, penalty_type="l2", optimizer="adam", batch_size=100,
    seed=123, verbosity="high")

      seed=123)

  #Use Pearson R2 regression metric
  metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean)

  print("Training model")
  model.fit(train_dataset, nb_epoch=nb_epoch)
#model.old_fit(train_dataset, nb_epoch=nb_epoch)

train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
#Only use for final evaluation
test_scores = model.evaluate(test_dataset, [metric], transformers)
  print("Evaluating models")
  train_score, train_task_scores = model.evaluate(
      train_dataset, [metric], transformers, per_task_metrics=True)
  valid_score, valid_task_scores = model.evaluate(
      valid_dataset, [metric], transformers, per_task_metrics=True)
  test_score, test_task_scores = model.evaluate(
      test_dataset, [metric], transformers, per_task_metrics=True)

  all_results.append((train_score, train_task_scores,
                      valid_score, valid_task_scores,
                      test_score, test_task_scores))

  print("Scores for trial %d" % trial)
  print("----------------------------------------------------------------")
  print("train_task_scores")
  print(train_task_scores)
  print("Mean Train score")
  print(train_score)
  print("valid_task_scores")
  print(valid_task_scores)
  print("Mean Validation score")
  print(valid_score)
  print("test_task_scores")
  print(test_task_scores)
  print("Mean Test score")
  print(test_score)

print("Train scores")
print(train_scores)
print("####################################################################")

print("Validation scores")
print(valid_scores)
for trial in range(num_trials):
  (train_score, train_task_scores, valid_score, valid_task_scores,
   test_score, test_task_scores) = all_results[trial]

print("Test scores")
print(test_scores)
  print("Scores for trial %d" % trial)
  print("----------------------------------------------------------------")
  print("train_task_scores")
  print(train_task_scores)
  print("Mean Train score")
  print(train_score)
  print("valid_task_scores")
  print(valid_task_scores)
  print("Mean Validation score")
  print(valid_score)
  print("test_task_scores")
  print(test_task_scores)
  print("Mean Test score")
  print(test_score)
+64 −31
Original line number Diff line number Diff line
"""
Script that trains Tensorflow Robust Multitask models on FACTORS datasets.
Script that trains Tensorflow Bypass Multitask models on FACTORS datasets.
"""

from __future__ import print_function
@@ -11,18 +11,16 @@ import numpy as np
import tempfile
import shutil
import deepchem as dc
from MERCK_datasets import load_factors
from FACTORS_datasets import load_factors

# Set numpy seed
np.random.seed(123)

###Load data###
shard_size = 2000
num_cores = 1
num_shards_per_batch = 4
print("About to load MERCK data.")
FACTORS_tasks, datasets, transformers = load_factors(
    shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)
num_trials = 5
print("About to load FACTORS data.")
FACTORS_tasks, datasets, transformers = load_factors(shard_size=shard_size)
train_dataset, valid_dataset, test_dataset = datasets

print("Number of compounds in train set")
@@ -34,8 +32,13 @@ print(len(test_dataset))

n_layers = 3
n_bypass_layers = 3
#nb_epoch = 150
nb_epoch = 125

#Use Pearson R2 regression metric
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean)

all_results = []
for trial in range(num_trials):
  model = dc.models.RobustMultitaskRegressor(
      len(FACTORS_tasks), train_dataset.get_data_shape()[0],
      layer_sizes=[1000]*n_layers, bypass_layer_sizes=[100]*n_bypass_layers,
@@ -44,25 +47,55 @@ model = dc.models.RobustMultitaskRegressor(
      bypass_weight_init_stddevs=[.02]*n_bypass_layers,
      bypass_bias_init_consts=[1.]*n_bypass_layers,
      learning_rate=.0003, penalty=.0001, penalty_type="l2",
    optimizer="adam", batch_size=100, seed=123, verbosity="high")

#Use R2 classification metric
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean,
                           mode="regression")
      optimizer="adam", batch_size=100, seed=123)

  print("Fitting Model")
  model.fit(train_dataset, nb_epoch=nb_epoch)

train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
#Only use for final evaluation
test_scores = model.evaluate(test_dataset, [metric], transformers)
  print("Evaluating models")
  train_score, train_task_scores = model.evaluate(
      train_dataset, [metric], transformers, per_task_metrics=True)
  valid_score, valid_task_scores = model.evaluate(
      valid_dataset, [metric], transformers, per_task_metrics=True)
  test_score, test_task_scores = model.evaluate(
      test_dataset, [metric], transformers, per_task_metrics=True)

  all_results.append((train_score, train_task_scores,
                      valid_score, valid_task_scores,
                      test_score, test_task_scores))

  print("Scores for trial %d" % trial)
  print("----------------------------------------------------------------")
  print("train_task_scores")
  print(train_task_scores)
  print("Mean Train score")
  print(train_score)
  print("valid_task_scores")
  print(valid_task_scores)
  print("Mean Validation score")
  print(valid_score)
  print("test_task_scores")
  print(test_task_scores)
  print("Mean Test score")
  print(test_score)

print("Train scores")
print(train_scores)
print("####################################################################")

print("Validation scores")
print(valid_scores)
for trial in range(num_trials):
  (train_score, train_task_scores, valid_score, valid_task_scores,
   test_score, test_task_scores) = all_results[trial]

print("Test scores")
print(test_scores)
  print("Scores for trial %d" % trial)
  print("----------------------------------------------------------------")
  print("train_task_scores")
  print(train_task_scores)
  print("Mean Train score")
  print(train_score)
  print("valid_task_scores")
  print(valid_task_scores)
  print("Mean Validation score")
  print(valid_score)
  print("test_task_scores")
  print(test_task_scores)
  print("Mean Test score")
  print(test_score)
+1 −3
Original line number Diff line number Diff line
"""
Script that trains Tensorflow Multitask models on FACTORS dataset.
Script that trains Tensorflow Singletask models on FACTORS dataset.
"""
from __future__ import print_function
from __future__ import division
@@ -34,7 +34,6 @@ metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean)

###Create model###
n_layers = 3
#nb_epoch = 1
nb_epoch = 125
n_features = train_dataset.get_data_shape()[0]
def task_model_builder(m_dir):
@@ -53,7 +52,6 @@ for trial in range(num_trials):
  print("Fitting Model")
  model.fit(train_dataset, nb_epoch=nb_epoch)


  print("Evaluating models")
  train_score, train_task_scores = model.evaluate(
      train_dataset, [metric], transformers, per_task_metrics=True)