Commit d97cbca1 authored by Peter Eastman's avatar Peter Eastman
Browse files

More fixes to examples

parent bfcafcfd
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -30,7 +30,6 @@ model = dc.models.MultitaskClassifier(

# Fit trained model
model.fit(train_dataset, nb_epoch=1)
model.save()

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
+4 −6
Original line number Diff line number Diff line
@@ -7,25 +7,23 @@ from __future__ import unicode_literals

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from toxcast_dataset import load_toxcast
from deepchem.molnet import load_toxcast
import deepchem as dc

toxcast_tasks, toxcast_datasets, transformers = load_toxcast(
    base_data_dir, reload=reload)
toxcast_tasks, toxcast_datasets, transformers = load_toxcast()
(train_dataset, valid_dataset, test_dataset) = toxcast_datasets

classification_metric = Metric(metrics.roc_auc_score, np.mean)
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)

def model_builder(model_dir):
  """Construct a random-forest classifier wrapped as a DeepChem model.

  Used by SingletaskToMultitask to build one independent model per task.
  `model_dir` is where the wrapped sklearn model persists its state.
  """
  # Balanced class weights compensate for label imbalance in the tasks;
  # n_jobs=-1 uses every available core when fitting the forest.
  forest = RandomForestClassifier(
      class_weight="balanced", n_estimators=500, n_jobs=-1)
  return dc.models.SklearnModel(forest, model_dir)

model = SingletaskToMultitask(toxcast_tasks, model_builder)
model = dc.models.SingletaskToMultitask(toxcast_tasks, model_builder)

# Fit trained model
model.fit(train_dataset)
model.save()

print("About to evaluate model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
+8 −4
Original line number Diff line number Diff line
@@ -16,13 +16,12 @@ import matplotlib
matplotlib.use('Agg')
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from UV_datasets import load_uv

###Load data###
np.random.seed(123)
shard_size = 2000
print("About to load UV data.")
UV_tasks, datasets, transformers = load_uv(shard_size=shard_size)
UV_tasks, datasets, transformers = dc.molnet.load_uv(shard_size=shard_size)
train_dataset, valid_dataset, test_dataset = datasets

y_train = train_dataset.y
@@ -40,8 +39,13 @@ for task in range(n_tasks):
    all_results.append(r2)

# the histogram of the data
n, bins, patches = plt.hist(np.array(all_results), 50, normed=True, stacked=True,
                            facecolor='green', alpha=0.75)
n, bins, patches = plt.hist(
    np.array(all_results),
    50,
    normed=True,
    stacked=True,
    facecolor='green',
    alpha=0.75)
plt.xlabel('Cross-task Correlations')
plt.ylabel('Probability Density')
plt.title('Histogram of UV Intertask Correlations')
+11 −9
Original line number Diff line number Diff line
@@ -11,14 +11,13 @@ import tempfile
import shutil
import deepchem as dc
from sklearn.ensemble import RandomForestRegressor
from UV_datasets import load_uv

###Load data###
np.random.seed(123)
shard_size = 2000
num_trials = 5
print("About to load UV data.")
UV_tasks, datasets, transformers = load_uv(shard_size=shard_size)
UV_tasks, datasets, transformers = dc.molnet.load_uv(shard_size=shard_size)
train_dataset, valid_dataset, test_dataset = datasets
####################################################### DEBUG
print("np.amin(train_dataset.y)")
@@ -39,12 +38,16 @@ print("Num features: %d" % num_features)

metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, task_averager=np.mean)


def task_model_builder(model_dir):
  """Construct a random-forest regressor wrapped as a DeepChem model.

  Built once per task by the multitask wrapper; `model_dir` is where the
  wrapped sklearn model persists its state.

  NOTE(review): the rendered diff contained both the old and the
  reformatted keyword-argument lines for this call, which is invalid
  Python as shown; this body keeps exactly one (the post-commit) set.
  """
  # max_features of ~1/3 of the features is the classic heuristic for
  # random-forest regression; num_features is defined at module level.
  sklearn_model = RandomForestRegressor(
      n_estimators=100,
      max_features=int(num_features / 3),
      min_samples_split=5,
      n_jobs=-1)
  return dc.models.SklearnModel(sklearn_model, model_dir)


all_results = []
for trial in range(num_trials):
  print("Starting trial %d" % trial)
@@ -61,9 +64,8 @@ for trial in range(num_trials):
  test_score, test_task_scores = model.evaluate(
      test_dataset, [metric], transformers, per_task_metrics=True)

  all_results.append((train_score, train_task_scores,
                      valid_score, valid_task_scores,
                      test_score, test_task_scores))
  all_results.append((train_score, train_task_scores, valid_score,
                      valid_task_scores, test_score, test_task_scores))

  print("----------------------------------------------------------------")
  print("Scores for trial %d" % trial)
@@ -84,8 +86,8 @@ for trial in range(num_trials):
print("####################################################################")

for trial in range(num_trials):
  (train_score, train_task_scores, valid_score, valid_task_scores,
   test_score, test_task_scores) = all_results[trial]
  (train_score, train_task_scores, valid_score, valid_task_scores, test_score,
   test_task_scores) = all_results[trial]
  print("----------------------------------------------------------------")
  print("Scores for trial %d" % trial)
  print("----------------------------------------------------------------")
+2 −4
Original line number Diff line number Diff line
@@ -11,13 +11,12 @@ import tempfile
import shutil
import numpy as np
import deepchem as dc
from UV_datasets import load_uv

###Load data###
shard_size = 2000
num_trials = 2
print("About to load UV data.")
UV_tasks, datasets, transformers = load_uv(shard_size=shard_size)
UV_tasks, datasets, transformers = dc.molnet.load_uv(shard_size=shard_size)
train_dataset, valid_dataset, test_dataset = datasets

print("Number of compounds in train set")
@@ -32,7 +31,7 @@ for trial in range(num_trials):
  ###Create model###
  n_layers = 3
  nb_epoch = 50
  model = dc.models.TensorflowMultitaskRegressor(
  model = dc.models.MultitaskRegressor(
      len(UV_tasks),
      train_dataset.get_data_shape()[0],
      layer_sizes=[1000] * n_layers,
@@ -42,7 +41,6 @@ for trial in range(num_trials):
      learning_rate=.0003,
      penalty=.0001,
      penalty_type="l2",
      optimizer="adam",
      batch_size=100,
      logdir="UV_tf_model")

Loading