Commit 5e15315e authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #348 from rbharath/new_datasets

Adds in factors/kinase/uv example datasets
parents 1a05c220 e42745d9
Loading
Loading
Loading
Loading
+83 −0
Original line number Diff line number Diff line
"""
FACTORS dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import shutil
import time
import numpy as np
import deepchem as dc
from factors_features import factors_descriptors 

def remove_missing_entries(dataset):
  """Drop rows whose feature vector is entirely zero, shard by shard.

  Missing entries show up in some datasets as all-zero feature rows. Each
  shard yielded by `dataset.itershards()` is filtered to the rows that have
  at least one non-zero feature and written back with `dataset.set_shard`.
  The number of dropped rows is printed per shard.

  Parameters
  ----------
  dataset: object exposing `itershards()` and `set_shard(i, X, y, w, ids)`
    Modified in place; nothing is returned.
  """
  for shard_num, shard in enumerate(dataset.itershards()):
    features, labels, weights, identifiers = shard
    # A row is kept when any of its features is non-zero.
    keep = np.any(features, axis=1)
    num_missing = np.count_nonzero(np.logical_not(keep))
    print("Shard %d has %d missing entries."
        % (shard_num, num_missing))
    dataset.set_shard(
        shard_num, features[keep], labels[keep], weights[keep],
        identifiers[keep])

def load_factors(shard_size=10000, num_shards_per_batch=4):
  """Load, featurize, clean and transform the FACTORS datasets.

  Parameters
  ----------
  shard_size: int, optional
    Forwarded to `loader.featurize` for the train, valid and test files.
  num_shards_per_batch: int, optional
    Forwarded to `loader.featurize` for the training file only.

  Returns
  -------
  tuple
    `(FACTORS_tasks, (train_dataset, valid_dataset, test_dataset),
    transformers)`, where `FACTORS_tasks` is the list of task names and
    `transformers` is the list of transformers that were applied to all
    three datasets.
  """
  verbosity = "high"
  # NOTE(review): bare file names -- presumably resolved relative to the
  # current working directory; confirm against DataLoader.featurize.
  train_files = "FACTORS_training_disguised_combined_full.csv.gz"
  valid_files = "FACTORS_test1_disguised_combined_full.csv.gz"
  test_files = "FACTORS_test2_disguised_combined_full.csv.gz"

  # Featurize FACTORS dataset
  print("About to featurize FACTORS dataset.")
  # `factors_descriptors` is the descriptor list imported at module level
  # from factors_features.
  featurizer = dc.feat.UserDefinedFeaturizer(factors_descriptors)
  # Task columns are named T_00001 ... T_00012.
  FACTORS_tasks = ['T_%05d' % i for i in range(1, 13)]

  loader = dc.load.DataLoader(
      tasks=FACTORS_tasks, id_field="Molecule",
      featurizer=featurizer, verbosity=verbosity)

  print("Featurizing train datasets")
  train_dataset = loader.featurize(
      train_files,
      shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)

  print("Featurizing valid datasets")
  valid_dataset = loader.featurize(
      valid_files, shard_size=shard_size)

  print("Featurizing test datasets")
  print("Creating test dataset")
  test_dataset = loader.featurize(
      test_files, shard_size=shard_size)

  print("Remove missing entries from datasets.")
  remove_missing_entries(train_dataset)
  remove_missing_entries(valid_dataset)
  remove_missing_entries(test_dataset)

  print("Transforming datasets with transformers.")
  # The normalization transformer is constructed from the training set only
  # and then applied to every split.
  transformers = [
      dc.trans.LogTransformer(transform_X=True),
      dc.trans.NormalizationTransformer(transform_y=True,
                                        dataset=train_dataset)]
  for transformer in transformers:
    print("Performing transformations with %s"
          % transformer.__class__.__name__)
    for dataset in [train_dataset, valid_dataset, test_dataset]:
      print("Transforming dataset")
      transformer.transform(dataset)

  print("Shuffling order of train dataset.")
  train_dataset.sparse_shuffle()

  return FACTORS_tasks, (train_dataset, valid_dataset, test_dataset), transformers
+62 −0
Original line number Diff line number Diff line
"""
Script that trains RF model on FACTORS datasets.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import tempfile
import shutil
import deepchem as dc
from sklearn.ensemble import RandomForestRegressor
from MERCK_datasets import load_factors

###Load data###
# Seed numpy before anything else runs.
np.random.seed(123)
shard_size = 2000
num_cores = 1
num_shards_per_batch = 4
print("About to load FACTORS data.")
FACTORS_tasks, datasets, transformers = load_factors(
    shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)
train_dataset, valid_dataset, test_dataset = datasets

# Report the size of each split.
for split_name, split_dataset in (("train", train_dataset),
                                  ("validation", valid_dataset),
                                  ("test", test_dataset)):
  print("Number of compounds in %s set" % split_name)
  print(len(split_dataset))

# First entry of the data shape is used as the feature count.
num_features = train_dataset.get_data_shape()[0]
print("Num features: %d" % num_features)

def task_model_builder(model_dir):
  """Build one random-forest regressor wrapped as a DeepChem SklearnModel.

  Parameters
  ----------
  model_dir: passed through unchanged as the second argument of
    `dc.models.SklearnModel`.

  Returns
  -------
  A `dc.models.SklearnModel` around a 100-tree `RandomForestRegressor`.
  """
  # Each split considers a third of the features (module-level num_features).
  max_feats = int(num_features/3)
  forest = RandomForestRegressor(
      n_estimators=100,
      max_features=max_feats,
      min_samples_split=5,
      n_jobs=-1)
  return dc.models.SklearnModel(forest, model_dir)
# One independent model per FACTORS task, each built by task_model_builder.
model = dc.models.SingletaskToMultitask(FACTORS_tasks, task_model_builder)

###Evaluate models###
# pearson_r2_score per task, averaged across tasks with np.mean.
metric = dc.metrics.Metric(
    dc.metrics.pearson_r2_score,
    task_averager=np.mean,
    mode="regression")

print("Training model")
model.fit(train_dataset)

train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
#Only use for final evaluation
test_scores = model.evaluate(test_dataset, [metric], transformers)

# All three evaluations finish before anything is reported.
for heading, scores in (("Train scores", train_scores),
                        ("Validation scores", valid_scores),
                        ("Test scores", test_scores)):
  print(heading)
  print(scores)
+64 −0
Original line number Diff line number Diff line
"""
Script that trains Tensorflow Multitask models on FACTORS datasets.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import tempfile
import shutil
import numpy as np
import deepchem as dc
from MERCK_datasets import load_factors

# Set numpy seed
np.random.seed(123)

###Load data###
shard_size = 2000
num_shards_per_batch = 4
# This script loads the FACTORS data via load_factors, so the message names
# FACTORS.
print("About to load FACTORS data.")
FACTORS_tasks, datasets, transformers = load_factors(
    shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)
# load_factors returns the splits as a (train, valid, test) triple.
train_dataset, valid_dataset, test_dataset = datasets

print("Number of compounds in train set")
print(len(train_dataset))
print("Number of compounds in validation set")
print(len(valid_dataset))
print("Number of compounds in test set")
print(len(test_dataset))

###Create model###
n_layers = 3
nb_epoch = 125
# Every per-layer hyperparameter list is the same value repeated n_layers
# times.
model = dc.models.TensorflowMultiTaskRegressor(
    n_tasks=len(FACTORS_tasks),
    n_features=train_dataset.get_data_shape()[0],
    layer_sizes=[1000]*n_layers,
    dropouts=[.25]*n_layers,
    weight_init_stddevs=[.02]*n_layers,
    bias_init_consts=[1.]*n_layers,
    learning_rate=.0003,
    penalty=.0001,
    penalty_type="l2",
    optimizer="adam",
    batch_size=100,
    seed=123,
    verbosity="high")

# Use the Pearson R^2 regression metric, averaged across tasks with np.mean.
# The mode is given explicitly because this is a regression model.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean,
                           mode="regression")

print("Training model")
model.fit(train_dataset, nb_epoch=nb_epoch)

# The transformers returned by load_factors are handed to evaluate along with
# the metric.
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
#Only use for final evaluation
test_scores = model.evaluate(test_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)

print("Test scores")
print(test_scores)
+65 −0
Original line number Diff line number Diff line
"""
Script that trains Tensorflow Progressive Multitask models on FACTORS datasets.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import tempfile
import shutil
import numpy as np
import deepchem as dc
from MERCK_datasets import load_factors

# Set numpy seed
np.random.seed(123)

###Load data###
shard_size = 2000
num_shards_per_batch = 4
# The data loaded here is FACTORS (via load_factors); the message says so.
print("About to load FACTORS data.")
FACTORS_tasks, datasets, transformers = load_factors(
    shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)
# Unpack the (train, valid, test) triple returned by load_factors.
train_dataset, valid_dataset, test_dataset = datasets

print("Number of compounds in train set")
print(len(train_dataset))
print("Number of compounds in validation set")
print(len(valid_dataset))
print("Number of compounds in test set")
print(len(test_dataset))

###Create model###
n_layers = 3
nb_epoch = 50
# The first two positional arguments are the task count and the first entry
# of the training data shape; every per-layer list repeats one value
# n_layers times.
model = dc.models.ProgressiveMultitaskRegressor(
    len(FACTORS_tasks),
    train_dataset.get_data_shape()[0],
    layer_sizes=[750]*n_layers,
    dropouts=[.25]*n_layers,
    alpha_init_stddevs=[.02]*n_layers,
    weight_init_stddevs=[.02]*n_layers,
    bias_init_consts=[1.]*n_layers,
    learning_rate=.0003,
    penalty=.0001,
    penalty_type="l2",
    optimizer="adam",
    batch_size=100,
    seed=123,
    verbosity="high")


# Use the Pearson R^2 regression metric, averaged across tasks with np.mean.
# The mode is given explicitly because this is a regression model.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean,
                           mode="regression")

print("Training model")
model.fit(train_dataset, nb_epoch=nb_epoch)

# The transformers returned by load_factors are handed to evaluate along with
# the metric.
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
#Only use for final evaluation
test_scores = model.evaluate(test_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)

print("Test scores")
print(test_scores)
+67 −0
Original line number Diff line number Diff line
"""
Script that trains one Tensorflow singletask model per task on FACTORS datasets.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import tempfile
import shutil
import deepchem as dc
from MERCK_datasets import load_factors

# Seed numpy.
np.random.seed(123)

###Load data###
shard_size = 2000
num_shards_per_batch = 4

print("About to load FACTORS data.")
FACTORS_tasks, datasets, transformers = load_factors(
    shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)
train_dataset, valid_dataset, test_dataset = datasets

# Report the size of each split.
for split_name, split_dataset in (("train", train_dataset),
                                  ("validation", valid_dataset),
                                  ("test", test_dataset)):
  print("Number of compounds in %s set" % split_name)
  print(len(split_dataset))

###Create model###
# Values read by task_model_builder and by the fit call further down.
n_layers = 3
nb_epoch = 10
# First entry of the training data shape is used as the feature count.
n_features = train_dataset.get_data_shape()[0]
def task_model_builder(m_dir):
  """Build a one-task Tensorflow regressor that logs to `m_dir`.

  Reads the module-level `n_features` and `n_layers`; each per-layer list
  repeats a single value `n_layers` times.
  """
  regressor = dc.models.TensorflowMultiTaskRegressor(
      n_tasks=1,
      n_features=n_features,
      logdir=m_dir,
      layer_sizes=[1000]*n_layers,
      dropouts=[.25]*n_layers,
      weight_init_stddevs=[.02]*n_layers,
      bias_init_consts=[1.]*n_layers,
      learning_rate=.0003,
      penalty=.0001,
      penalty_type="l2",
      optimizer="adam",
      batch_size=100,
      seed=123,
      verbosity="high")
  return regressor
# One independent single-task model per FACTORS task, each built by
# task_model_builder.
model = dc.models.SingletaskToMultitask(FACTORS_tasks, task_model_builder)

###Evaluate models###
# Pearson R^2 regression metric, averaged across tasks with np.mean. The mode
# is given explicitly because these are regression models.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean,
                           mode="regression")

print("Fitting Model")
model.fit(train_dataset, nb_epoch=nb_epoch)

# The transformers returned by load_factors are handed to evaluate along with
# the metric.
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
#Only use for final evaluation
test_scores = model.evaluate(test_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)

print("Test scores")
print(test_scores)
Loading