Commit 5ba2e919 authored by miaecle's avatar miaecle
Browse files

style change

parent b5e2f694
Loading
Loading
Loading
Loading
+189 −107
Original line number Diff line number Diff line
@@ -21,10 +21,12 @@ from tensorflow.python.framework import test_util
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor


class TestOverfit(test_util.TensorFlowTestCase):
  """
  Test that models can overfit simple datasets.
  """

  def setUp(self):
    super(TestOverfit, self).setUp()
    self.current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -126,8 +128,11 @@ class TestOverfit(test_util.TensorFlowTestCase):
    regression_metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
    # TODO(rbharath): This breaks with optimizer="momentum". Why?
    model = dc.models.TensorflowMultiTaskRegressor(
        n_tasks, n_features, dropouts=[0.],
        learning_rate=0.003, weight_init_stddevs=[np.sqrt(6)/np.sqrt(1000)],
        n_tasks,
        n_features,
        dropouts=[0.],
        learning_rate=0.003,
        weight_init_stddevs=[np.sqrt(6) / np.sqrt(1000)],
        batch_size=n_samples)

    # Fit trained model
@@ -155,8 +160,11 @@ class TestOverfit(test_util.TensorFlowTestCase):

    classification_metric = dc.metrics.Metric(dc.metrics.accuracy_score)
    model = dc.models.TensorflowMultiTaskClassifier(
        n_tasks, n_features, dropouts=[0.],
        learning_rate=0.0003, weight_init_stddevs=[.1],
        n_tasks,
        n_features,
        dropouts=[0.],
        learning_rate=0.0003,
        weight_init_stddevs=[.1],
        batch_size=n_samples)

    # Fit trained model
@@ -184,9 +192,13 @@ class TestOverfit(test_util.TensorFlowTestCase):
    fit_transformers = [dc.trans.CoulombFitTransformer(dataset)]
    regression_metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
    model = dc.models.TensorflowMultiTaskFitTransformRegressor(
        n_tasks, [n_features, n_features], dropouts=[0.],
        learning_rate=0.003, weight_init_stddevs=[np.sqrt(6)/np.sqrt(1000)],
        batch_size=n_samples, fit_transformers=fit_transformers, n_evals=1)
        n_tasks, [n_features, n_features],
        dropouts=[0.],
        learning_rate=0.003,
        weight_init_stddevs=[np.sqrt(6) / np.sqrt(1000)],
        batch_size=n_samples,
        fit_transformers=fit_transformers,
        n_evals=1)

    # Fit trained model
    model.fit(dataset, nb_epoch=100)
@@ -214,11 +226,13 @@ class TestOverfit(test_util.TensorFlowTestCase):

    dataset = dc.data.NumpyDataset(X, y, w, ids)

    classification_metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score)
    classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
    model = dc.models.TensorflowMultiTaskClassifier(
        n_tasks, n_features, dropouts=[0.],
        learning_rate=0.003, weight_init_stddevs=[.1],
        n_tasks,
        n_features,
        dropouts=[0.],
        learning_rate=0.003,
        weight_init_stddevs=[.1],
        batch_size=n_samples)

    # Fit trained model
@@ -257,11 +271,13 @@ class TestOverfit(test_util.TensorFlowTestCase):

    dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids)

    classification_metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score)
    classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
    model = dc.models.TensorflowMultiTaskClassifier(
        n_tasks, n_features, dropouts=[0.],
        learning_rate=0.003, weight_init_stddevs=[1.],
        n_tasks,
        n_features,
        dropouts=[0.],
        learning_rate=0.003,
        weight_init_stddevs=[1.],
        batch_size=n_samples)

    # Fit trained model
@@ -289,9 +305,11 @@ class TestOverfit(test_util.TensorFlowTestCase):

    classification_metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score, task_averager=np.mean)

    def model_builder(model_dir):
      """Return a dc.models.SklearnModel around a new RandomForestClassifier.

      `model_dir` is passed straight through as the second argument of
      `dc.models.SklearnModel`; a fresh, default-configured forest is created
      on every call.
      """
      return dc.models.SklearnModel(RandomForestClassifier(), model_dir)

    model = dc.models.SingletaskToMultitask(tasks, model_builder)

    # Fit trained model
@@ -320,8 +338,11 @@ class TestOverfit(test_util.TensorFlowTestCase):
    classification_metric = dc.metrics.Metric(
        dc.metrics.accuracy_score, task_averager=np.mean)
    model = dc.models.TensorflowMultiTaskClassifier(
        n_tasks, n_features, dropouts=[0.],
        learning_rate=0.0003, weight_init_stddevs=[.1],
        n_tasks,
        n_features,
        dropouts=[0.],
        learning_rate=0.0003,
        weight_init_stddevs=[.1],
        batch_size=n_samples)

    # Fit trained model
@@ -350,9 +371,13 @@ class TestOverfit(test_util.TensorFlowTestCase):
    classification_metric = dc.metrics.Metric(
        dc.metrics.accuracy_score, task_averager=np.mean)
    model = dc.models.RobustMultitaskClassifier(
        n_tasks, n_features, layer_sizes=[50],
        bypass_layer_sizes=[10], dropouts=[0.],
        learning_rate=0.003, weight_init_stddevs=[.1],
        n_tasks,
        n_features,
        layer_sizes=[50],
        bypass_layer_sizes=[10],
        dropouts=[0.],
        learning_rate=0.003,
        weight_init_stddevs=[.1],
        batch_size=n_samples)

    # Fit trained model
@@ -381,7 +406,10 @@ class TestOverfit(test_util.TensorFlowTestCase):
    classification_metric = dc.metrics.Metric(
        dc.metrics.accuracy_score, task_averager=np.mean)
    model = dc.models.TensorflowLogisticRegression(
        n_tasks, n_features, learning_rate=0.5, weight_init_stddevs=[.01],
        n_tasks,
        n_features,
        learning_rate=0.5,
        weight_init_stddevs=[.01],
        batch_size=n_samples)

    # Fit trained model
@@ -421,7 +449,6 @@ class TestOverfit(test_util.TensorFlowTestCase):
    scores = model.evaluate(dataset_trans, [classification_metric])
    assert scores[classification_metric.name] > .9


  def test_sklearn_multitask_regression_overfit(self):
    """Test SKLearn singletask-to-multitask overfits tiny regression data."""
    n_tasks = 2
@@ -440,9 +467,11 @@ class TestOverfit(test_util.TensorFlowTestCase):

    regression_metric = dc.metrics.Metric(
        dc.metrics.r2_score, task_averager=np.mean)

    def model_builder(model_dir):
      """Return a dc.models.SklearnModel around a new RandomForestRegressor.

      `model_dir` is passed straight through as the second argument of
      `dc.models.SklearnModel`; a fresh, default-configured forest is created
      on every call.
      """
      return dc.models.SklearnModel(RandomForestRegressor(), model_dir)

    model = dc.models.SingletaskToMultitask(tasks, model_builder)

    # Fit trained model
@@ -470,11 +499,13 @@ class TestOverfit(test_util.TensorFlowTestCase):
    dataset = dc.data.NumpyDataset(X, y, w, ids)

    regression_metric = dc.metrics.Metric(
        dc.metrics.mean_squared_error,
        task_averager=np.mean, mode="regression")
        dc.metrics.mean_squared_error, task_averager=np.mean, mode="regression")
    model = dc.models.TensorflowMultiTaskRegressor(
        n_tasks, n_features, dropouts=[0.],
        learning_rate=0.0003, weight_init_stddevs=[.1],
        n_tasks,
        n_features,
        dropouts=[0.],
        learning_rate=0.0003,
        weight_init_stddevs=[.1],
        batch_size=n_samples)

    # Fit trained model
@@ -504,12 +535,15 @@ class TestOverfit(test_util.TensorFlowTestCase):
    dataset = dc.data.NumpyDataset(X, y, w, ids)

    regression_metric = dc.metrics.Metric(
        dc.metrics.mean_squared_error,
        task_averager=np.mean, mode="regression")
        dc.metrics.mean_squared_error, task_averager=np.mean, mode="regression")
    model = dc.models.RobustMultitaskRegressor(
        n_tasks, n_features, layer_sizes=[50],
        bypass_layer_sizes=[10], dropouts=[0.],
        learning_rate=0.003, weight_init_stddevs=[.1],
        n_tasks,
        n_features,
        layer_sizes=[50],
        bypass_layer_sizes=[10],
        dropouts=[0.],
        learning_rate=0.003,
        weight_init_stddevs=[.1],
        batch_size=n_samples)

    # Fit trained model
@@ -539,8 +573,7 @@ class TestOverfit(test_util.TensorFlowTestCase):
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(input_file)

    classification_metric = dc.metrics.Metric(
        dc.metrics.accuracy_score)
    classification_metric = dc.metrics.Metric(dc.metrics.accuracy_score)

    n_feat = 75
    batch_size = 10
@@ -557,9 +590,15 @@ class TestOverfit(test_util.TensorFlowTestCase):

      with self.test_session() as sess:
        model = dc.models.MultitaskGraphClassifier(
          sess, graph_model, n_tasks, batch_size=batch_size,
          learning_rate=1e-3, learning_rate_decay_time=1000,
          optimizer_type="adam", beta1=.9, beta2=.999)
            sess,
            graph_model,
            n_tasks,
            batch_size=batch_size,
            learning_rate=1e-3,
            learning_rate_decay_time=1000,
            optimizer_type="adam",
            beta1=.9,
            beta2=.999)

        # Fit trained model
        model.fit(dataset, nb_epoch=20)
@@ -590,8 +629,7 @@ class TestOverfit(test_util.TensorFlowTestCase):
    dataset = loader.featurize(input_file)

    classification_metric = dc.metrics.Metric(
        dc.metrics.mean_squared_error,
        task_averager=np.mean)
        dc.metrics.mean_squared_error, task_averager=np.mean)

    n_feat = 75
    batch_size = 10
@@ -608,9 +646,15 @@ class TestOverfit(test_util.TensorFlowTestCase):

      with self.test_session() as sess:
        model = dc.models.MultitaskGraphRegressor(
          sess, graph_model, n_tasks, batch_size=batch_size,
          learning_rate=1e-2, learning_rate_decay_time=1000,
          optimizer_type="adam", beta1=.9, beta2=.999)
            sess,
            graph_model,
            n_tasks,
            batch_size=batch_size,
            learning_rate=1e-2,
            learning_rate_decay_time=1000,
            optimizer_type="adam",
            beta1=.9,
            beta2=.999)

        # Fit trained model
        model.fit(dataset, nb_epoch=40)
@@ -664,12 +708,18 @@ class TestOverfit(test_util.TensorFlowTestCase):

      with self.test_session() as sess:
        model = dc.models.SupportGraphClassifier(
          sess, support_model, test_batch_size=test_batch_size,
          support_batch_size=support_batch_size, learning_rate=1e-3)
            sess,
            support_model,
            test_batch_size=test_batch_size,
            support_batch_size=support_batch_size,
            learning_rate=1e-3)

        # Fit trained model. Dataset has 6 positives and 4 negatives, so set
        # n_pos/n_neg accordingly.
        model.fit(dataset, n_episodes_per_epoch=n_train_trials, n_pos=n_pos,
        model.fit(
            dataset,
            n_episodes_per_epoch=n_train_trials,
            n_pos=n_pos,
            n_neg=n_neg)
        model.save()

@@ -678,8 +728,12 @@ class TestOverfit(test_util.TensorFlowTestCase):
        # can measure model has memorized support).  Replacement is turned off to
        # ensure that support contains full training set. This checks that the
        # model has mastered memorization of provided support.
        scores, _ = model.evaluate(dataset, classification_metric, n_trials=5,
                                   n_pos=n_pos, n_neg=n_neg,
        scores, _ = model.evaluate(
            dataset,
            classification_metric,
            n_trials=5,
            n_pos=n_pos,
            n_neg=n_neg,
            exclude_support=False)

      # Measure performance on 0-th task.
@@ -726,17 +780,24 @@ class TestOverfit(test_util.TensorFlowTestCase):
      support_model.add_support(dc.nn.GraphGather(support_batch_size))

      # Apply an attention lstm layer
      support_model.join(dc.nn.AttnLSTMEmbedding(
          test_batch_size, support_batch_size, max_depth))
      support_model.join(
          dc.nn.AttnLSTMEmbedding(test_batch_size, support_batch_size,
                                  max_depth))

      with self.test_session() as sess:
        model = dc.models.SupportGraphClassifier(
          sess, support_model, test_batch_size=test_batch_size,
          support_batch_size=support_batch_size, learning_rate=1e-3)
            sess,
            support_model,
            test_batch_size=test_batch_size,
            support_batch_size=support_batch_size,
            learning_rate=1e-3)

        # Fit trained model. Dataset has 6 positives and 4 negatives, so set
        # n_pos/n_neg accordingly.
        model.fit(dataset, n_episodes_per_epoch=n_train_trials, n_pos=n_pos,
        model.fit(
            dataset,
            n_episodes_per_epoch=n_train_trials,
            n_pos=n_pos,
            n_neg=n_neg)
        model.save()

@@ -745,8 +806,12 @@ class TestOverfit(test_util.TensorFlowTestCase):
        # can measure model has memorized support).  Replacement is turned off to
        # ensure that support contains full training set. This checks that the
        # model has mastered memorization of provided support.
        scores, _ = model.evaluate(dataset, classification_metric, n_trials=5,
                                   n_pos=n_pos, n_neg=n_neg,
        scores, _ = model.evaluate(
            dataset,
            classification_metric,
            n_trials=5,
            n_pos=n_pos,
            n_neg=n_neg,
            exclude_support=False)

      # Measure performance on 0-th task.
@@ -773,8 +838,7 @@ class TestOverfit(test_util.TensorFlowTestCase):
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(input_file)

    classification_metric = dc.metrics.Metric(
        dc.metrics.accuracy_score)
    classification_metric = dc.metrics.Metric(dc.metrics.accuracy_score)

    with g.as_default():
      support_model = dc.nn.SequentialSupportGraph(n_feat)
@@ -793,18 +857,25 @@ class TestOverfit(test_util.TensorFlowTestCase):
      support_model.add_support(dc.nn.GraphGather(support_batch_size))

      # Apply a residual lstm layer
      support_model.join(dc.nn.ResiLSTMEmbedding(
          test_batch_size, support_batch_size, max_depth))
      support_model.join(
          dc.nn.ResiLSTMEmbedding(test_batch_size, support_batch_size,
                                  max_depth))

      with self.test_session() as sess:
        model = dc.models.SupportGraphClassifier(
          sess, support_model, test_batch_size=test_batch_size,
          support_batch_size=support_batch_size, learning_rate=1e-3)
            sess,
            support_model,
            test_batch_size=test_batch_size,
            support_batch_size=support_batch_size,
            learning_rate=1e-3)

        # Fit trained model. Dataset has 6 positives and 4 negatives, so set
        # n_pos/n_neg accordingly.

        model.fit(dataset, n_episodes_per_epoch=n_train_trials, n_pos=n_pos,
        model.fit(
            dataset,
            n_episodes_per_epoch=n_train_trials,
            n_pos=n_pos,
            n_neg=n_neg)
        model.save()

@@ -813,8 +884,12 @@ class TestOverfit(test_util.TensorFlowTestCase):
        # can measure model has memorized support).  Replacement is turned off to
        # ensure that support contains full training set. This checks that the
        # model has mastered memorization of provided support.
        scores, _ = model.evaluate(dataset, classification_metric, n_trials=5,
                                   n_pos=n_pos, n_neg=n_neg,
        scores, _ = model.evaluate(
            dataset,
            classification_metric,
            n_trials=5,
            n_pos=n_pos,
            n_neg=n_neg,
            exclude_support=False)

      # Measure performance on 0-th task.
@@ -839,9 +914,16 @@ class TestOverfit(test_util.TensorFlowTestCase):

    metric = dc.metrics.Metric(dc.metrics.rms_score, task_averager=np.mean)
    model = dc.models.ProgressiveMultitaskRegressor(
        n_tasks, n_features, layer_sizes=[50], bypass_layer_sizes=[10],
        dropouts=[0.], learning_rate=0.003, weight_init_stddevs=[.1], seed=123,
        alpha_init_stddevs=[.02], batch_size=n_samples)
        n_tasks,
        n_features,
        layer_sizes=[50],
        bypass_layer_sizes=[10],
        dropouts=[0.],
        learning_rate=0.003,
        weight_init_stddevs=[.1],
        seed=123,
        alpha_init_stddevs=[.02],
        batch_size=n_samples)

    # Fit trained model
    model.fit(dataset, nb_epoch=20)
+105 −82

File changed.

Preview size limit exceeded, changes collapsed.

+66 −45
Original line number Diff line number Diff line
@@ -12,12 +12,14 @@ import deepchem as dc
import scipy.io
import csv


def load_qm7_from_mat(featurizer=None, split='stratified'):
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(
      current_dir, "./qm7.mat")
  dataset_file = os.path.join(current_dir, "./qm7.mat")

  if not os.path.exists(dataset_file): os.system('wget -P '+current_dir+' http://www.quantum-machine.org/data/qm7.mat')
  if not os.path.exists(dataset_file):
    os.system('wget -P ' + current_dir +
              ' http://www.quantum-machine.org/data/qm7.mat')
  dataset = scipy.io.loadmat(dataset_file)

  X = dataset['X']
@@ -27,8 +29,7 @@ def load_qm7_from_mat(featurizer=None, split='stratified'):
  print(len(dataset))

  current_dir = os.path.dirname(os.path.realpath(__file__))
  split_file = os.path.join(
      current_dir, "./qm7_splits.csv")
  split_file = os.path.join(current_dir, "./qm7_splits.csv")

  split_indices = []
  with open(split_file, 'r') as f:
@@ -37,17 +38,23 @@ def load_qm7_from_mat(featurizer=None, split='stratified'):
      row_int = (np.asarray(list(map(int, row)))).tolist()
      split_indices.append(row_int)

  splitters = {'index': dc.splits.IndexSplitter(),
  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'indice': dc.splits.IndiceSplitter(valid_indices=split_indices[1]),
               'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=0)}
      'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=0)
  }
  splitter = splitters[split]
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset)
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset)
  print(len(train_dataset))
  print(len(valid_dataset))
  print(len(test_dataset))

  transformers = [dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = [
      dc.trans.NormalizationTransformer(
          transform_y=True, dataset=train_dataset)
  ]

  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
@@ -57,12 +64,14 @@ def load_qm7_from_mat(featurizer=None, split='stratified'):
  qm7_tasks = np.arange(y.shape[0])
  return qm7_tasks, (train_dataset, valid_dataset, test_dataset), transformers


def load_qm7b_from_mat(featurizer=None, split='stratified'):
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(
      current_dir, "./qm7b.mat")
  dataset_file = os.path.join(current_dir, "./qm7b.mat")

  if not os.path.exists(dataset_file): os.system('wget -P '+current_dir+' http://www.quantum-machine.org/data/qm7b.mat')
  if not os.path.exists(dataset_file):
    os.system('wget -P ' + current_dir +
              ' http://www.quantum-machine.org/data/qm7b.mat')
  dataset = scipy.io.loadmat(dataset_file)

  X = dataset['X']
@@ -71,8 +80,7 @@ def load_qm7b_from_mat(featurizer=None, split='stratified'):
  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids=None)

  current_dir = os.path.dirname(os.path.realpath(__file__))
  split_file = os.path.join(
      current_dir, "./qm7_splits.csv")
  split_file = os.path.join(current_dir, "./qm7_splits.csv")

  split_indices = []
  with open(split_file, 'r') as f:
@@ -81,14 +89,20 @@ def load_qm7b_from_mat(featurizer=None, split='stratified'):
      row_int = (np.asarray(list(map(int, row)))).tolist()
      split_indices.append(row_int)

  splitters = {'index': dc.splits.IndexSplitter(),
  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'indice': dc.splits.IndiceSplitter(valid_indices=split_indices[1]),
               'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=0)}
      'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=0)
  }
  splitter = splitters[split]
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset)
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset)

  transformers = [dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = [
      dc.trans.NormalizationTransformer(
          transform_y=True, dataset=train_dataset)
  ]

  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
@@ -98,23 +112,24 @@ def load_qm7b_from_mat(featurizer=None, split='stratified'):
  qm7_tasks = np.arange(y.shape[1])
  return qm7_tasks, (train_dataset, valid_dataset, test_dataset), transformers

def load_qm7(featurizer=None, split='random'):

def load_qm7(featurizer=None, split='random'):
  """Load qm7 datasets."""
  # Featurize qm7 dataset
  print("About to featurize qm7 dataset.")
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(
      current_dir, "./gdb7.sdf")
  dataset_file = os.path.join(current_dir, "./gdb7.sdf")
  qm7_tasks = ["u0_atom"]
  if featurizer is None:
    featurizer = dc.feat.CoulombMatrixEig(23)
  loader = dc.data.SDFLoader(tasks=qm7_tasks, smiles_field="smiles", 
                             mol_field="mol", featurizer=featurizer)
  loader = dc.data.SDFLoader(
      tasks=qm7_tasks,
      smiles_field="smiles",
      mol_field="mol",
      featurizer=featurizer)
  dataset = loader.featurize(dataset_file)

  split_file = os.path.join(
      current_dir, "./qm7_splits.csv")
  split_file = os.path.join(current_dir, "./qm7_splits.csv")

  split_indices = []
  with open(split_file, 'r') as f:
@@ -123,14 +138,20 @@ def load_qm7(featurizer=None, split='random'):
      row_int = (np.asarray(list(map(int, row)))).tolist()
      split_indices.append(row_int)

  splitters = {'index': dc.splits.IndexSplitter(),
  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'indice': dc.splits.IndiceSplitter(valid_indices=split_indices[1]),
               'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=0)}
      'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=0)
  }
  splitter = splitters[split]
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset)
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset)

  transformers = [dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = [
      dc.trans.NormalizationTransformer(
          transform_y=True, dataset=train_dataset)
  ]

  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)