Unverified Commit 635c9b09 authored by Vignesh Ram Somnath's avatar Vignesh Ram Somnath Committed by GitHub
Browse files

Merge pull request #3 from deepchem/master

parents 1661fdba 2a0aeacc
Loading
Loading
Loading
Loading
+25 −9
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ import time
import shutil
import json
from multiprocessing.dummy import Pool
import warnings

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
@@ -955,7 +956,7 @@ class DiskDataset(Dataset):

  @staticmethod
  def from_numpy(X,
                 y,
                 y=None,
                 w=None,
                 ids=None,
                 tasks=None,
@@ -965,14 +966,29 @@ class DiskDataset(Dataset):
    n_samples = len(X)
    if ids is None:
      ids = np.arange(n_samples)

    if y is not None:
      if w is None:
        w = np.ones_like(y)

      if tasks is None:
        if len(y.shape) > 1:
          n_tasks = y.shape[1]
        else:
          n_tasks = 1
        tasks = np.arange(n_tasks)

    else:
      if w is not None:
        warnings.warn('y is None but w is not None. Setting w to None',
                      UserWarning)
        w = None

      if tasks is not None:
        warnings.warn('y is None but tasks is not None. Setting tasks to None',
                      UserWarning)
        tasks = None

    # raw_data = (X, y, w, ids)
    return DiskDataset.create_dataset(
        [(X, y, w, ids)], data_dir=data_dir, tasks=tasks, verbose=verbose)
+1 −1
Original line number Diff line number Diff line
@@ -140,7 +140,7 @@ class TestHyperparamOpt(unittest.TestCase):

    transformers = []
    metric = dc.metrics.Metric(
        dc.metrics.matthews_corrcoef, np.mean, mode="classification")
        dc.metrics.roc_auc_score, np.mean, mode="classification")
    params_dict = {"layer_sizes": [(10,), (100,)]}

    def model_builder(model_params, model_dir):
+64 −57
Original line number Diff line number Diff line
@@ -207,6 +207,11 @@ class Metric(object):
      else:
        raise ValueError("Must specify mode for new metric.")
    assert mode in ["classification", "regression"]
    if self.metric.__name__ in [
        "accuracy_score", "balanced_accuracy_score", "recall_score",
        "matthews_corrcoef"
    ] and threshold is None:
      self.threshold = 0.5
    self.mode = mode
    # The convention used is that the first task is the metric.
    # TODO(rbharath, joegomes): This doesn't seem like it should be hard-coded as
@@ -293,6 +298,7 @@ class Metric(object):
    Raises:
      NotImplementedError: If metric_str is not in METRICS.
    """

    y_true = np.array(np.squeeze(y_true[w != 0]))
    y_pred = np.array(np.squeeze(y_pred[w != 0]))

@@ -304,7 +310,8 @@ class Metric(object):
    if not y_true.size:
      return np.nan
    if self.threshold is not None:
      y_pred = np.greater(y_pred, threshold)
      y_pred = y_pred[:, 1]
      y_pred = np.greater(y_pred, self.threshold)
    if len(y_true.shape) == 0:
      y_true = np.expand_dims(y_true, 0)
    if len(y_pred.shape) == 0:
+17 −28
Original line number Diff line number Diff line
@@ -9,7 +9,7 @@ import tempfile
import numpy as np
import tensorflow as tf
import deepchem as dc
from datasets import load_sider_convmol
from deepchem.models.tensorgraph.models.graph_models import GraphConvModel

# 4-fold splits
K = 4
@@ -19,23 +19,21 @@ n_neg = 10
# 20 trials on test-set
n_trials = 20

sider_tasks, dataset, transformers = load_sider_convmol()
sider_tasks, fold_datasets, transformers = dc.molnet.load_sider(
    featurizer='GraphConv', split="task")

# Define metric
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, mode="classification")

task_splitter = dc.splits.TaskSplitter()
fold_datasets = task_splitter.k_fold_split(dataset, K)

train_folds = fold_datasets[:-1]
train_dataset = dc.splits.merge_fold_datasets(train_folds)
test_dataset = fold_datasets[-1]

# Get supports on test-set
support_generator = dc.data.SupportGenerator(test_dataset, n_pos, n_neg,
                                             n_trials)

# Compute accuracies

task_scores = {task: [] for task in range(len(test_dataset.get_task_names()))}

for trial_num, (task, support) in enumerate(support_generator):
@@ -45,27 +43,9 @@ for trial_num, (task, support) in enumerate(support_generator):
  n_feat = 75
  # Batch size of models
  batch_size = 50
  graph_model = dc.nn.SequentialGraph(n_feat)
  graph_model.add(dc.nn.GraphConv(64, n_feat, activation='relu'))
  graph_model.add(dc.nn.GraphPool())
  graph_model.add(dc.nn.GraphConv(128, 64, activation='relu'))
  graph_model.add(dc.nn.GraphPool())
  graph_model.add(dc.nn.GraphConv(64, 128, activation='relu'))
  graph_model.add(dc.nn.GraphPool())
  graph_model.add(dc.nn.Dense(128, 64, activation='tanh'))
  graph_model.add(dc.nn.GraphGather(batch_size, activation="tanh"))

  model = dc.models.MultitaskGraphClassifier(
      graph_model,
      1,
      n_feat,
      batch_size=batch_size,
      learning_rate=1e-3,
      learning_rate_decay_time=1000,
      optimizer_type="adam",
      beta1=.9,
      beta2=.999)

  #graph_model = dc.nn.SequentialGraph(n_feat)
  model = GraphConvModel(
      1, graph_conv_layers=[64, 128, 64], batch_size=batch_size)
  # Fit trained model
  model.fit(support, nb_epoch=10)

@@ -89,4 +69,13 @@ print(mean_task_scores)
print("Standard Deviations")
print(std_task_scores)
print("Median of Mean Scores")
# dict.values() is a plain list on Python 2.7 but a dict_values view on
# Python 3.x. np.array() wraps such a view as a single opaque object instead
# of a numeric array, which np.median cannot reduce. Materializing the values
# with list() first yields a real sequence on both versions, so the result is
# the same without relying on an exception-driven fallback.
print(np.median(np.array(list(mean_task_scores.values()))))