Commit 29efdb3b authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Kaggle changes

parent 26d11c98
Loading
Loading
Loading
Loading
+57 −18
Original line number Diff line number Diff line
@@ -16,9 +16,9 @@ from kaggle_datasets import load_kaggle
###Load data###
np.random.seed(123)
shard_size = 2000
num_trials = 5
print("About to load KAGGLE data.")
KAGGLE_tasks, datasets, transformers = load_kaggle(
    shard_size=shard_size)
KAGGLE_tasks, datasets, transformers = load_kaggle(shard_size=shard_size)
train_dataset, valid_dataset, test_dataset = datasets

print("Number of compounds in train set")
@@ -31,29 +31,68 @@ print(len(test_dataset))
num_features = train_dataset.get_data_shape()[0]
print("Num features: %d" % num_features)

metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean)

def task_model_builder(model_dir):
  sklearn_model = RandomForestRegressor(
      n_estimators=100, max_features=int(num_features/3),
      #n_estimators=100, max_features=int(num_features/3),
      n_estimators=1, max_features=int(num_features/3),
      min_samples_split=5, n_jobs=-1)
  return dc.models.SklearnModel(sklearn_model, model_dir)
model = dc.models.SingletaskToMultitask(KAGGLE_tasks, task_model_builder)

###Evaluate models###
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean)
all_results = []
for trial in range(num_trials):
  print("Starting trial %d" % trial)
  model = dc.models.SingletaskToMultitask(KAGGLE_tasks, task_model_builder)

  print("Training model")
  model.fit(train_dataset)

train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
#Only use for final evaluation
test_scores = model.evaluate(test_dataset, [metric], transformers)
  print("Evaluating models")
  train_score, train_task_scores = model.evaluate(
      train_dataset, [metric], transformers, per_task_metrics=True)
  valid_score, valid_task_scores = model.evaluate(
      valid_dataset, [metric], transformers, per_task_metrics=True)
  test_score, test_task_scores = model.evaluate(
      test_dataset, [metric], transformers, per_task_metrics=True)

  all_results.append((train_score, train_task_scores,
                      valid_score, valid_task_scores,
                      test_score, test_task_scores))

print("Train scores")
print(train_scores)
  print("----------------------------------------------------------------")
  print("Scores for trial %d" % trial)
  print("----------------------------------------------------------------")
  print("train_task_scores")
  print(train_task_scores)
  print("Mean Train score")
  print(train_score)
  print("valid_task_scores")
  print(valid_task_scores)
  print("Mean Validation score")
  print(valid_score)
  print("test_task_scores")
  print(test_task_scores)
  print("Mean Test score")
  print(test_score)

print("Validation scores")
print(valid_scores)
print("####################################################################")

print("Test scores")
print(test_scores)
for trial in range(num_trials):
  (train_score, train_task_scores, valid_score, valid_task_scores,
   test_score, test_task_scores) = all_results[trial]
  print("----------------------------------------------------------------")
  print("Scores for trial %d" % trial)
  print("----------------------------------------------------------------")
  print("train_task_scores")
  print(train_task_scores)
  print("Mean Train score")
  print(train_score)
  print("valid_task_scores")
  print(valid_task_scores)
  print("Mean Validation score")
  print(valid_score)
  print("test_task_scores")
  print(test_task_scores)
  print("Mean Test score")
  print(test_score)
+65 −0
Original line number Diff line number Diff line
"""
Script that trains Tensorflow Progressive Multitask models on KAGGLE datasets.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import tempfile
import shutil
import numpy as np
import deepchem as dc
from MERCK_datasets import load_kaggle

# Seed numpy's global random number generator.
np.random.seed(123)

###Load data###
shard_size = 2000
num_shards_per_batch = 4
print("About to load KAGGLE data.")
KAGGLE_tasks, datasets, transformers = load_kaggle(
    shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)
train_dataset, valid_dataset, test_dataset = datasets

print("Number of compounds in train set")
print(len(train_dataset))
print("Number of compounds in validation set")
print(len(valid_dataset))
print("Number of compounds in test set")
print(len(test_dataset))

###Create model###
# Every per-layer hyperparameter list below is built with n_layers entries.
n_layers = 3
nb_epoch = 50
# Positional arguments: number of tasks, then the first entry of the training
# data shape (presumably the feature count -- confirm against the model API).
model = dc.models.ProgressiveMultitaskRegressor(
    len(KAGGLE_tasks), train_dataset.get_data_shape()[0],
    layer_sizes=[100]*n_layers, dropouts=[.25]*n_layers,
    alpha_init_stddevs=[.02]*n_layers, weight_init_stddevs=[.02]*n_layers,
    bias_init_consts=[1.]*n_layers, learning_rate=.0003,
    penalty=.0001, penalty_type="l2", optimizer="adam", batch_size=100,
    seed=123, verbosity="high")

###Evaluate model###
# Pearson R^2 regression metric; per-task values are combined with np.mean.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean)

print("Training model")
model.fit(train_dataset, nb_epoch=nb_epoch)

# The transformers returned by load_kaggle are handed to every evaluate call.
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
#Only use for final evaluation
test_scores = model.evaluate(test_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)

print("Test scores")
print(test_scores)
+67 −0
Original line number Diff line number Diff line
"""
Script that trains Tensorflow Robust Multitask models on KAGGLE datasets.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import tempfile
import shutil
import deepchem as dc
from MERCK_datasets import load_kaggle

# Seed numpy's global random number generator.
np.random.seed(123)

###Load data###
shard_size = 2000
num_shards_per_batch = 4
print("About to load KAGGLE data.")
KAGGLE_tasks, datasets, transformers = load_kaggle(
    shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)
train_dataset, valid_dataset, test_dataset = datasets

print("KAGGLE_tasks")
print(KAGGLE_tasks)
print("Number of compounds in train set")
print(len(train_dataset))
print("Number of compounds in validation set")
print(len(valid_dataset))
print("Number of compounds in test set")
print(len(test_dataset))

###Create model###
# layer_sizes below is written out explicitly with n_layers entries; the
# remaining per-layer lists are sized from n_layers / n_bypass_layers.
n_layers = 3
n_bypass_layers = 3
nb_epoch = 100
# Positional arguments: number of tasks, then the first entry of the training
# data shape (presumably the feature count -- confirm against the model API).
model = dc.models.RobustMultitaskRegressor(
    len(KAGGLE_tasks), train_dataset.get_data_shape()[0],
    layer_sizes=[2000, 1000, 1000], bypass_layer_sizes=[200]*n_bypass_layers,
    dropouts=[.25]*n_layers, bypass_dropouts=[.25]*n_bypass_layers,
    weight_init_stddevs=[.02]*n_layers, bias_init_consts=[1.]*n_layers,
    bypass_weight_init_stddevs=[.02]*n_bypass_layers,
    bypass_bias_init_consts=[1.]*n_bypass_layers,
    learning_rate=.00003, penalty=.0004, penalty_type="l2",
    optimizer="adam", batch_size=100, seed=123, verbosity="high")

###Evaluate model###
# Pearson R^2 regression metric; per-task values are combined with np.mean.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean)

print("Fitting Model")
model.fit(train_dataset, nb_epoch=nb_epoch)

# The transformers returned by load_kaggle are handed to every evaluate call.
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
#Only use for final evaluation
test_scores = model.evaluate(test_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)

print("Test scores")
print(test_scores)
+65 −0
Original line number Diff line number Diff line
"""
Script that trains Tensorflow Singletask models on KAGGLE dataset.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import tempfile
import shutil
import deepchem as dc
from MERCK_datasets import load_kaggle

# Seed numpy's global random number generator.
np.random.seed(123)

###Load data###
shard_size, num_shards_per_batch = 2000, 4

print("About to load KAGGLE data.")
KAGGLE_tasks, datasets, transformers = load_kaggle(
    num_shards_per_batch=num_shards_per_batch, shard_size=shard_size)
(train_dataset, valid_dataset, test_dataset) = datasets

# Report how many compounds each split holds, in train/validation/test order.
for size_label, split in (
    ("Number of compounds in train set", train_dataset),
    ("Number of compounds in validation set", valid_dataset),
    ("Number of compounds in test set", test_dataset)):
  print(size_label)
  print(len(split))

###Create model###
n_layers = 3
nb_epoch = 100
# Taken from the first entry of the training data shape.
n_features = train_dataset.get_data_shape()[0]
def task_model_builder(m_dir):
  """Build a TensorflowMultiTaskRegressor configured for a single task.

  `m_dir` is passed to the model as its `logdir`. The feature count and the
  number of layers are read from the module-level `n_features` and `n_layers`.
  """
  hyperparams = dict(
      n_tasks=1, n_features=n_features, logdir=m_dir,
      layer_sizes=[2000, 1000, 1000], dropouts=[.25]*n_layers,
      weight_init_stddevs=[.02]*n_layers, bias_init_consts=[1.]*n_layers,
      learning_rate=.00003, penalty=.0004, penalty_type="l2", optimizer="adam",
      batch_size=100, seed=123, verbosity="high")
  return dc.models.TensorflowMultiTaskRegressor(**hyperparams)
# One model per task, each produced by task_model_builder.
model = dc.models.SingletaskToMultitask(KAGGLE_tasks, task_model_builder)

###Evaluate models###
# Pearson R^2 metric; per-task values are combined with np.mean.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean)

print("Fitting Model")
model.fit(train_dataset, nb_epoch=nb_epoch)

# Score all three splits before printing anything. The test split comes last
# and should only be used for final evaluation.
train_scores, valid_scores, test_scores = [
    model.evaluate(split, [metric], transformers)
    for split in (train_dataset, valid_dataset, test_dataset)]

for heading, scores in (("Train scores", train_scores),
                        ("Validation scores", valid_scores),
                        ("Test scores", test_scores)):
  print(heading)
  print(scores)
+58 −23
Original line number Diff line number Diff line
@@ -10,8 +10,6 @@ import shutil
import time
import numpy as np
import deepchem as dc
import sys
sys.path.append(".")
from kaggle_features import merck_descriptors 

def remove_missing_entries(dataset):
@@ -30,42 +28,44 @@ def remove_missing_entries(dataset):
    ids = ids[available_rows]
    dataset.set_shard(i, X, y, w, ids)

def get_transformers(train_dataset):
  """Return the list of transformers to apply to the datasets.

  The list is currently empty and `train_dataset` is not consulted. A
  LogTransformer on X and a NormalizationTransformer on y (fit on
  `train_dataset`) are disabled for now.
  """
  # A fresh list is returned on every call.
  return []


# Set shard size low to avoid memory problems.
def load_kaggle(shard_size=2000, featurizer=None):
def gen_kaggle(KAGGLE_tasks, raw_train_dir, train_dir, valid_dir, test_dir,
                shard_size=2000):
  """Load KAGGLE datasets. Does not do train/test split"""
  ############################################################## TIMING
  time1 = time.time()
  ############################################################## TIMING
  # Set some global variables up top
  current_dir = os.path.dirname(os.path.realpath(__file__))
  train_files = os.path.join(current_dir,
      "KAGGLE_training_disguised_combined_full.csv.gz")
  valid_files = os.path.join(current_dir,
      "KAGGLE_test1_disguised_combined_full.csv.gz")
  test_files = os.path.join(current_dir,
      "KAGGLE_test2_disguised_combined_full.csv.gz")
  train_files = ("KAGGLE_training_disguised_combined_full.csv.gz")
  valid_files = ("KAGGLE_test1_disguised_combined_full.csv.gz")
  test_files = ("KAGGLE_test2_disguised_combined_full.csv.gz")

  # Featurize KAGGLE dataset
  print("About to featurize KAGGLE dataset.")
  featurizer = dc.feat.UserDefinedFeaturizer(merck_descriptors)
  KAGGLE_tasks = ['3A4', 'CB1', 'DPP4', 'HIVINT', 'HIV_PROT', 'LOGD', 'METAB',
                  'NK1', 'OX1', 'OX2', 'PGP', 'PPB', 'RAT_F', 'TDI',
                  'THROMBIN']

  loader = dc.data.UserCSVLoader(
      tasks=KAGGLE_tasks, id_field="Molecule", featurizer=featurizer)

  train_datasets, valid_datasets, test_datasets = [], [], []
  print("Featurizing train datasets")
  train_dataset = loader.featurize(
      train_files, shard_size=shard_size)
  train_dataset = loader.featurize(train_files, shard_size=shard_size)

  print("Featurizing valid datasets")
  valid_dataset = loader.featurize(
      valid_files, shard_size=shard_size)
  valid_dataset = loader.featurize(valid_files, shard_size=shard_size)

  print("Featurizing test datasets")
  test_dataset = loader.featurize(
      test_files, shard_size=shard_size)
  test_dataset = loader.featurize(test_files, shard_size=shard_size)

  print("Remove missing entries from datasets.")
  remove_missing_entries(train_dataset)
@@ -73,14 +73,13 @@ def load_kaggle(shard_size=2000, featurizer=None):
  remove_missing_entries(test_dataset)

  print("Transforming datasets with transformers.")
  transformers = [
      dc.trans.LogTransformer(transform_X=True),
      dc.trans.NormalizationTransformer(transform_y=True,
                                        dataset=train_dataset)]
  transformers = get_transformers(train_dataset)
  raw_train_dataset = train_dataset

  for transformer in transformers:
    print("Performing transformations with %s"
          % transformer.__class__.__name__)
    print("Transforming datasets")
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)
@@ -88,9 +87,45 @@ def load_kaggle(shard_size=2000, featurizer=None):
  print("Shuffling order of train dataset.")
  train_dataset.sparse_shuffle()

  print("Moving directories")
  raw_train_dataset.move(raw_train_dir)
  train_dataset.move(train_dir)
  valid_dataset.move(valid_dir)
  test_dataset.move(test_dir)

  ############################################################## TIMING
  time2 = time.time()
  print("TIMING: KAGGLE fitting took %0.3f s" % (time2-time1))
  ############################################################## TIMING
  
  return (raw_train_dataset, train_dataset, valid_dataset, test_dataset)

def load_kaggle(shard_size=2000):
  """Loads kaggle datasets. Generates if not stored already.

  The datasets live in four directories next to this file (`raw_train_dir`,
  `train_dir`, `valid_dir`, `test_dir`). When all four exist they are reopened
  as DiskDatasets; otherwise everything is regenerated through `gen_kaggle`.

  Parameters
  ----------
  shard_size: int, optional
    Forwarded to `gen_kaggle` when the datasets have to be generated; it is
    not used when existing datasets are reloaded. Defaults to 2000, matching
    the default of `gen_kaggle`.

  Returns
  -------
  tuple
    `(KAGGLE_tasks, (train_dataset, valid_dataset, test_dataset),
    transformers)`, where `transformers` is whatever `get_transformers`
    returns for the raw training dataset.
  """
  KAGGLE_tasks = ['3A4', 'CB1', 'DPP4', 'HIVINT', 'HIV_PROT', 'LOGD', 'METAB',
                  'NK1', 'OX1', 'OX2', 'PGP', 'PPB', 'RAT_F', 'TDI',
                  'THROMBIN']

  current_dir = os.path.dirname(os.path.realpath(__file__))
  raw_train_dir = os.path.join(current_dir, "raw_train_dir")
  train_dir = os.path.join(current_dir, "train_dir")
  valid_dir = os.path.join(current_dir, "valid_dir")
  test_dir = os.path.join(current_dir, "test_dir")
  data_dirs = (raw_train_dir, train_dir, valid_dir, test_dir)

  # Reload only when every directory is present; a partial set is treated the
  # same as no stored data at all.
  if all(os.path.exists(data_dir) for data_dir in data_dirs):
    print("Reloading existing datasets")
    raw_train_dataset = dc.data.DiskDataset(raw_train_dir)
    train_dataset = dc.data.DiskDataset(train_dir)
    valid_dataset = dc.data.DiskDataset(valid_dir)
    test_dataset = dc.data.DiskDataset(test_dir)
  else:
    print("Featurizing datasets")
    (raw_train_dataset, train_dataset, valid_dataset, test_dataset) = \
      gen_kaggle(KAGGLE_tasks, raw_train_dir, train_dir, valid_dir, test_dir,
                  shard_size=shard_size)

  # The transformers are derived from the raw training set, not train_dataset.
  transformers = get_transformers(raw_train_dataset)
  return KAGGLE_tasks, (train_dataset, valid_dataset, test_dataset), transformers