Commit e17f245a authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Kinase changes

parent c011725e
Loading
Loading
Loading
Loading
+43 −0
Original line number Diff line number Diff line
"""
Script that computes correlations of KINASE tasks. 
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import tempfile
import shutil
import deepchem as dc
import pandas as pd
from KINASE_datasets import load_kinase

###Load data###
# Seed numpy before anything is loaded.
np.random.seed(123)
shard_size = 2000
print("About to load KINASE data.")
KINASE_tasks, datasets, transformers = load_kinase(shard_size=shard_size)
train_dataset, valid_dataset, test_dataset = datasets

# Training labels; the second axis is walked task by task below.
y_train = train_dataset.y
n_tasks = y_train.shape[1]

###Compute pairwise task scores###
all_results = []
for row_idx in range(n_tasks):
  row_labels = y_train[:, row_idx]
  row_scores = []
  for col_idx in range(n_tasks):
    if row_idx != col_idx:
      score = dc.metrics.pearson_r2_score(row_labels, y_train[:, col_idx])
      print("r2 for %s-%s is %f" % (row_idx, col_idx, score))
    else:
      # A task paired with itself is recorded as 1 without calling the metric.
      score = 1.
    row_scores.append(score)
  print("Task %d" % row_idx)
  print(row_scores)
  all_results.append(row_scores)

###Write results###
print("Writing results to kinase_corr.csv")
df = pd.DataFrame(all_results)
df.to_csv("kinase_corr.csv")
+1 −5
Original line number Diff line number Diff line
@@ -30,11 +30,7 @@ def remove_missing_entries(dataset):

def get_transformers(train_dataset):
  """Get transformers applied to datasets.

  Args:
    train_dataset: Dataset handed to the NormalizationTransformer that is
      constructed in the body.

  Returns:
    The list bound to `transformers` when the function returns, which is
    the empty list.
  """
  # NOTE(review): this listing looks like a rendered diff in which both the
  # removed and the added assignment are visible -- confirm against the
  # actual file before relying on it.
  #transformers = []
  transformers = [
      dc.trans.LogTransformer(transform_X=True),
      dc.trans.NormalizationTransformer(transform_y=True,
                                        dataset=train_dataset)]
  # This rebinding discards the list built above, so no transformers are
  # returned to the caller.
  transformers = []
  return transformers

def gen_kinase(KINASE_tasks, raw_train_dir, train_dir, valid_dir, test_dir,
+3 −3
Original line number Diff line number Diff line
@@ -16,7 +16,8 @@ from KINASE_datasets import load_kinase
###Load data###
np.random.seed(123)
shard_size = 2000
num_trials = 5
#num_trials = 5
num_trials = 1
print("About to load KINASE data.")
KINASE_tasks, datasets, transformers = load_kinase(shard_size=shard_size)
train_dataset, valid_dataset, test_dataset = datasets
@@ -35,8 +36,7 @@ metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean)

def task_model_builder(model_dir):
  """Build a random-forest regressor wrapped as a DeepChem sklearn model.

  Args:
    model_dir: Directory passed through to dc.models.SklearnModel.

  Returns:
    A dc.models.SklearnModel wrapping the configured RandomForestRegressor.
  """
  # n_estimators and max_features used to be passed twice in this call
  # (n_estimators as 1 and as 100), which Python rejects as a repeated
  # keyword argument. The single 100-tree setting is kept.
  sklearn_model = RandomForestRegressor(
      n_estimators=100, max_features=int(num_features/3),
      min_samples_split=5, n_jobs=-1)
  return dc.models.SklearnModel(sklearn_model, model_dir)

+67 −29
Original line number Diff line number Diff line
@@ -11,14 +11,16 @@ import tempfile
import shutil
import numpy as np
import deepchem as dc
from MERCK_datasets import load_kinase
from KINASE_datasets import load_kinase

# Set numpy seed
np.random.seed(123)

###Load data###
shard_size = 2000
num_shards_per_batch = 4
print("About to load MERCK data.")
KINASE_tasks, datasets, transformers = load_kinase(
    shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)
num_trials = 5
print("About to load KINASE data.")
KINASE_tasks, datasets, transformers = load_kinase(shard_size=shard_size)
train_dataset, valid_dataset, test_dataset = datasets

print("Number of compounds in train set")
@@ -28,6 +30,8 @@ print(len(valid_dataset))
print("Number of compounds in test set")
print(len(test_dataset))

all_results = []
for trial in range(num_trials):
  ###Create model###
  n_layers = 3
  nb_epoch = 50
@@ -43,18 +47,52 @@ model = dc.models.TensorflowMultiTaskRegressor(
  metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean)

  print("Training model")
model.fit(train_dataset, nb_epoch=nb_epoch)
  model.fit(train_dataset, nb_epoch=nb_epoch, max_checkpoints_to_keep=1)

  print("Evaluating models")
  train_score, train_task_scores = model.evaluate(
      train_dataset, [metric], transformers, per_task_metrics=True)
  valid_score, valid_task_scores = model.evaluate(
      valid_dataset, [metric], transformers, per_task_metrics=True)
  test_score, test_task_scores = model.evaluate(
      test_dataset, [metric], transformers, per_task_metrics=True)

  all_results.append((train_score, train_task_scores,
                      valid_score, valid_task_scores,
                      test_score, test_task_scores))

train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
#Only use for final evaluation
test_scores = model.evaluate(test_dataset, [metric], transformers)
  print("Scores for trial %d" % trial)
  print("----------------------------------------------------------------")
  print("train_task_scores")
  print(train_task_scores)
  print("Mean Train score")
  print(train_score)
  print("valid_task_scores")
  print(valid_task_scores)
  print("Mean Validation score")
  print(valid_score)
  print("test_task_scores")
  print(test_task_scores)
  print("Mean Test score")
  print(test_score)

print("Train scores")
print(train_scores)
print("####################################################################")

print("Validation scores")
print(valid_scores)
for trial in range(num_trials):
  (train_score, train_task_scores, valid_score, valid_task_scores,
   test_score, test_task_scores) = all_results[trial]

print("Test scores")
print(test_scores)
  print("Scores for trial %d" % trial)
  print("----------------------------------------------------------------")
  print("train_task_scores")
  print(train_task_scores)
  print("Mean Train score")
  print(train_score)
  print("valid_task_scores")
  print(valid_task_scores)
  print("Mean Validation score")
  print(valid_score)
  print("test_task_scores")
  print(test_task_scores)
  print("Mean Test score")
  print(test_score)
+63 −30
Original line number Diff line number Diff line
@@ -11,17 +11,16 @@ import tempfile
import shutil
import numpy as np
import deepchem as dc
from MERCK_datasets import load_kinase
from KINASE_datasets import load_kinase

# Set numpy seed
np.random.seed(123)

###Load data###
shard_size = 2000
num_shards_per_batch = 4
print("About to load MERCK data.")
KINASE_tasks, datasets, transformers = load_kinase(
    shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)
num_trials = 5
print("About to load KINASE data.")
KINASE_tasks, datasets, transformers = load_kinase(shard_size=shard_size)
train_dataset, valid_dataset, test_dataset = datasets

print("Number of compounds in train set")
@@ -31,6 +30,8 @@ print(len(valid_dataset))
print("Number of compounds in test set")
print(len(test_dataset))

all_results = []
for trial in range(num_trials):
  ###Create model###
  n_layers = 3
  nb_epoch = 50
@@ -40,26 +41,58 @@ model = dc.models.ProgressiveMultitaskRegressor(
      alpha_init_stddevs=[.02]*n_layers, weight_init_stddevs=[.02]*n_layers,
      bias_init_consts=[1.]*n_layers, learning_rate=.0003,
      penalty=.0001, penalty_type="l2", optimizer="adam", batch_size=100,
    seed=123, verbosity="high")

      seed=123)

  #Use R2 classification metric
  metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean)

  print("Training model")
  model.fit(train_dataset, nb_epoch=nb_epoch)
#model.old_fit(train_dataset, nb_epoch=nb_epoch)

train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
#Only use for final evaluation
test_scores = model.evaluate(test_dataset, [metric], transformers)
  print("Evaluating models")
  train_score, train_task_scores = model.evaluate(
      train_dataset, [metric], transformers, per_task_metrics=True)
  valid_score, valid_task_scores = model.evaluate(
      valid_dataset, [metric], transformers, per_task_metrics=True)
  test_score, test_task_scores = model.evaluate(
      test_dataset, [metric], transformers, per_task_metrics=True)

  all_results.append((train_score, train_task_scores,
                      valid_score, valid_task_scores,
                      test_score, test_task_scores))

  print("Scores for trial %d" % trial)
  print("----------------------------------------------------------------")
  print("train_task_scores")
  print(train_task_scores)
  print("Mean Train score")
  print(train_score)
  print("valid_task_scores")
  print(valid_task_scores)
  print("Mean Validation score")
  print(valid_score)
  print("test_task_scores")
  print(test_task_scores)
  print("Mean Test score")
  print(test_score)

print("Train scores")
print(train_scores)
print("####################################################################")

print("Validation scores")
print(valid_scores)
for trial in range(num_trials):
  (train_score, train_task_scores, valid_score, valid_task_scores,
   test_score, test_task_scores) = all_results[trial]

print("Test scores")
print(test_scores)
  print("Scores for trial %d" % trial)
  print("----------------------------------------------------------------")
  print("train_task_scores")
  print(train_task_scores)
  print("Mean Train score")
  print(train_score)
  print("valid_task_scores")
  print(valid_task_scores)
  print("Mean Validation score")
  print(valid_score)
  print("test_task_scores")
  print(test_task_scores)
  print("Mean Test score")
  print(test_score)
Loading