Commit 5bd27fcd authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Adding Kaggle example

parent 7fa45771
Loading
Loading
Loading
Loading
+60 −0
Original line number Diff line number Diff line
"""
Script that trains RF model on KAGGLE datasets.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import tempfile
import shutil
import deepchem as dc
from sklearn.ensemble import RandomForestRegressor
from MERCK_datasets import load_kaggle

### Load data ###
# Seed numpy's global RNG before anything else runs.
np.random.seed(123)
shard_size = 2000
num_shards_per_batch = 4
print("About to load KAGGLE data.")
KAGGLE_tasks, datasets, transformers = load_kaggle(
    shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)
train_dataset, valid_dataset, test_dataset = datasets

# Report how many compounds ended up in each split.
for split_name, split in zip(("train", "validation", "test"), datasets):
  print("Number of compounds in %s set" % split_name)
  print(len(split))

# First entry of the data shape; used below to size the forest's max_features.
num_features = train_dataset.get_data_shape()[0]
print("Num features: %d" % num_features)

def task_model_builder(model_dir):
  """Build one random-forest model wrapped as a DeepChem SklearnModel.

  Args:
    model_dir: handed to dc.models.SklearnModel as its second argument.

  Returns:
    A dc.models.SklearnModel wrapping a new RandomForestRegressor.
  """
  # A third of the features (rounded down) is considered at each split.
  features_per_split = int(num_features / 3)
  forest = RandomForestRegressor(
      n_estimators=100,
      max_features=features_per_split,
      min_samples_split=5,
      n_jobs=-1)
  return dc.models.SklearnModel(forest, model_dir)
# task_model_builder is handed over as the factory for the underlying models
# (presumably one per task -- confirm against SingletaskToMultitask).
model = dc.models.SingletaskToMultitask(KAGGLE_tasks, task_model_builder)

### Evaluate models ###
# Pearson R^2 score, combined across tasks with np.mean.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean)

print("Training model")
model.fit(train_dataset)

# Score every split before printing anything so the output order is fixed.
# The test split should only be consulted for the final evaluation.
train_scores, valid_scores, test_scores = [
    model.evaluate(split, [metric], transformers)
    for split in (train_dataset, valid_dataset, test_dataset)]

for heading, scores in (("Train scores", train_scores),
                        ("Validation scores", valid_scores),
                        ("Test scores", test_scores)):
  print(heading)
  print(scores)
+65 −0
Original line number Diff line number Diff line
"""
Script that trains Tensorflow Multitask models on KAGGLE datasets.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import tempfile
import shutil
import numpy as np
import deepchem as dc
from MERCK_datasets import load_kaggle

# Seed numpy's global RNG before anything else runs.
np.random.seed(123)

### Load data ###
shard_size = 2000
num_shards_per_batch = 4
# This script loads the KAGGLE data via load_kaggle, so the status message
# names KAGGLE (it previously said "MERCK", unlike the sibling RF script).
print("About to load KAGGLE data.")
KAGGLE_tasks, datasets, transformers = load_kaggle(
    shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)
train_dataset, valid_dataset, test_dataset = datasets

# Echo the task names and the size of each split.
print("KAGGLE_tasks")
print(KAGGLE_tasks)
print("Number of compounds in train set")
print(len(train_dataset))
print("Number of compounds in validation set")
print(len(valid_dataset))
print("Number of compounds in test set")
print(len(test_dataset))

### Create model ###
n_layers = 3
nb_epoch = 300
# The two positional arguments: number of tasks and first entry of the
# training data shape.
n_tasks = len(KAGGLE_tasks)
n_features = train_dataset.get_data_shape()[0]
model = dc.models.TensorflowMultiTaskRegressor(
    n_tasks, n_features,
    layer_sizes=[1000] * n_layers,
    dropouts=[.25] * n_layers,
    weight_init_stddevs=[.02] * n_layers,
    bias_init_consts=[1.] * n_layers,
    learning_rate=.0003,
    penalty=.0001,
    penalty_type="l2",
    optimizer="adam",
    batch_size=100,
    seed=123,
    verbosity="high")

# Pearson R^2 score (a regression metric), combined across tasks with np.mean.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean)

print("Training model")
model.fit(train_dataset, nb_epoch=nb_epoch)

# Score every split before printing anything so the output order is fixed.
# The test split should only be consulted for the final evaluation.
train_scores, valid_scores, test_scores = [
    model.evaluate(split, [metric], transformers)
    for split in (train_dataset, valid_dataset, test_dataset)]

for heading, scores in (("Train scores", train_scores),
                        ("Validation scores", valid_scores),
                        ("Test scores", test_scores)):
  print(heading)
  print(scores)
+3 −0
Original line number Diff line number Diff line
# Download the three KAGGLE CSV archives (training, test1, test2), in order.
for split in training test1 test2; do
  wget "http://deepchem.io/datasets/KAGGLE_${split}_disguised_combined_full.csv.gz"
done
+94 −0
Original line number Diff line number Diff line
"""
KAGGLE dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import shutil
import time
import numpy as np
import deepchem as dc
from merck_features_full import merck_descriptors 

def remove_missing_entries(dataset):
  """Drop rows whose feature vector is entirely zero, shard by shard.

  Missing entries show up in these datasets as all-zero feature rows. For
  every shard yielded by dataset.itershards(), the rows with at least one
  nonzero feature are kept (in X, y, w and ids alike) and the filtered
  shard is written back with dataset.set_shard(). The number of dropped
  rows is printed per shard.

  Args:
    dataset: object providing itershards() -> (X, y, w, ids) tuples and
      set_shard(index, X, y, w, ids).
  """
  for shard_num, shard in enumerate(dataset.itershards()):
    X, y, w, ids = shard
    # True for rows that carry at least one nonzero feature.
    keep = X.any(axis=1)
    num_missing = np.count_nonzero(~keep)
    print("Shard %d has %d missing entries."
        % (shard_num, num_missing))
    dataset.set_shard(shard_num, X[keep], y[keep], w[keep], ids[keep])

# Set shard size low to avoid memory problems.
def load_kaggle(shard_size=10000, num_shards_per_batch=4,
                data_dir="../merck_datasets/KAGGLE_processed"):
  """Load, clean and transform the pre-split KAGGLE datasets.

  The train, validation and test sets are read from three separate files;
  no additional train/test splitting is performed here.

  Args:
    shard_size: shard size passed to loader.featurize for every split.
    num_shards_per_batch: passed to loader.featurize for the training
      split only; the validation/test calls do not pass it.
    data_dir: directory containing the three KAGGLE *.csv.gz files. The
      default is the relative path this loader has always used, so it is
      resolved against the current working directory.

  Returns:
    A tuple (KAGGLE_tasks, (train_dataset, valid_dataset, test_dataset),
    transformers), where transformers is the list of transformer objects
    that were applied to all three datasets.
  """
  ############################################################## TIMING
  time1 = time.time()
  ############################################################## TIMING
  # Locations of the three pre-split input files.
  train_files = os.path.join(
      data_dir, "KAGGLE_training_disguised_combined_full.csv.gz")
  valid_files = os.path.join(
      data_dir, "KAGGLE_test1_disguised_combined_full.csv.gz")
  test_files = os.path.join(
      data_dir, "KAGGLE_test2_disguised_combined_full.csv.gz")

  # Featurize KAGGLE dataset
  print("About to featurize KAGGLE dataset.")
  featurizer = dc.feat.UserDefinedFeaturizer(merck_descriptors)
  KAGGLE_tasks = ['3A4', 'CB1', 'DPP4', 'HIVINT', 'HIV_PROT', 'LOGD', 'METAB',
                  'NK1', 'OX1', 'OX2', 'PGP', 'PPB', 'RAT_F', 'TDI',
                  'THROMBIN']

  loader = dc.load.DataLoader(
      tasks=KAGGLE_tasks, id_field="Molecule",
      featurizer=featurizer, verbosity="high")
  print("Featurizing train datasets")
  train_dataset = loader.featurize(
      train_files,
      shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)

  print("Featurizing valid datasets")
  valid_dataset = loader.featurize(
      valid_files, shard_size=shard_size)

  print("Featurizing test datasets")
  test_dataset = loader.featurize(
      test_files, shard_size=shard_size)

  # Rows with an all-zero feature vector are treated as missing and dropped.
  print("Remove missing entries from datasets.")
  remove_missing_entries(train_dataset)
  remove_missing_entries(valid_dataset)
  remove_missing_entries(test_dataset)

  print("Transforming datasets with transformers.")
  # The normalization transformer is constructed from the training set and
  # then applied to all three splits. The list is returned so callers can
  # hand it to model.evaluate.
  transformers = [
      dc.trans.LogTransformer(transform_X=True),
      dc.trans.NormalizationTransformer(transform_y=True,
                                        dataset=train_dataset)]
  for transformer in transformers:
    print("Performing transformations with %s"
          % transformer.__class__.__name__)
    for dataset in [train_dataset, valid_dataset, test_dataset]:
      print("Transforming dataset")
      transformer.transform(dataset)

  # Only the training set is shuffled.
  print("Shuffling order of train dataset.")
  train_dataset.sparse_shuffle()

  ############################################################## TIMING
  time2 = time.time()
  print("TIMING: KAGGLE fitting took %0.3f s" % (time2-time1))
  ############################################################## TIMING

  return KAGGLE_tasks, (train_dataset, valid_dataset, test_dataset), transformers
+1 −0

File added.

Preview size limit exceeded, changes collapsed.