Unverified Commit 6de39a63 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #976 from rbharath/trim

Remove Deprecated Classes
parents 494c8dee 33c018b9
Loading
Loading
Loading
Loading
+0 −4
Original line number Diff line number Diff line
@@ -8,10 +8,6 @@ from __future__ import unicode_literals
from deepchem.models.models import Model
from deepchem.models.sklearn_models import SklearnModel
from deepchem.models.xgboost_models import XGBoostModel
from deepchem.models.tf_new_models.multitask_classifier import MultitaskGraphClassifier
from deepchem.models.tf_new_models.multitask_regressor import MultitaskGraphRegressor, DTNNMultitaskGraphRegressor

from deepchem.models.tf_new_models.support_classifier import SupportGraphClassifier
from deepchem.models.multitask import SingletaskToMultitask

from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskRegressor
+0 −525
Original line number Diff line number Diff line
@@ -805,155 +805,6 @@ class TestOverfit(test_util.TensorFlowTestCase):
    scores = model.evaluate(dataset, [regression_metric])
    assert scores[regression_metric.name] < .2

  def test_graph_conv_singletask_classification_overfit(self):
    """Test graph-conv singletask classifier overfits tiny data.

    Featurizes `example_classification.csv` with `ConvMolFeaturizer`, trains
    a small `MultitaskGraphClassifier` on the single "outcome" task for 20
    epochs and asserts that accuracy on the training data exceeds 0.65.
    """
    # Seed both RNGs so the run is repeatable.
    np.random.seed(123)
    tf.set_random_seed(123)
    n_tasks = 1

    # Load the mini classification dataset found under self.current_dir.
    featurizer = dc.feat.ConvMolFeaturizer()
    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir, "example_classification.csv")
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(input_file)

    classification_metric = dc.metrics.Metric(dc.metrics.accuracy_score)

    # NOTE(review): 75 is presumably the per-atom feature width emitted by
    # ConvMolFeaturizer -- confirm against the featurizer.
    n_feat = 75
    batch_size = 10

    graph_model = dc.nn.SequentialGraph(n_feat)
    graph_model.add(dc.nn.GraphConv(64, n_feat, activation='relu'))
    graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
    graph_model.add(dc.nn.GraphPool())
    # Gather Projection
    graph_model.add(dc.nn.Dense(128, 64, activation='relu'))
    graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
    graph_model.add(dc.nn.GraphGather(batch_size, activation="tanh"))

    model = dc.models.MultitaskGraphClassifier(
        graph_model,
        n_tasks,
        n_feat,
        batch_size=batch_size,
        learning_rate=1e-3,
        learning_rate_decay_time=1000,
        optimizer_type="adam",
        beta1=.9,
        beta2=.999)

    # Fit trained model
    model.fit(dataset, nb_epoch=20)
    model.save()

    # Eval model on train: overfitting means the training set is fit well.
    scores = model.evaluate(dataset, [classification_metric])

    assert scores[classification_metric.name] > .65

  def test_graph_conv_singletask_regression_overfit(self):
    """Test graph-conv singletask regressor overfits tiny data.

    Featurizes `example_regression.csv` with `ConvMolFeaturizer`, trains a
    small `MultitaskGraphRegressor` on the single "outcome" task for 40
    epochs and asserts that mean squared error on the training data is
    below 0.2.
    """
    # Seed both RNGs so the run is repeatable.
    np.random.seed(123)
    tf.set_random_seed(123)
    n_tasks = 1

    # Load the mini regression dataset found under self.current_dir.
    featurizer = dc.feat.ConvMolFeaturizer()
    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir, "example_regression.csv")
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(input_file)

    # Mean squared error, averaged over tasks; lower is better.
    regression_metric = dc.metrics.Metric(
        dc.metrics.mean_squared_error, task_averager=np.mean)

    # NOTE(review): 75 is presumably the per-atom feature width emitted by
    # ConvMolFeaturizer -- confirm against the featurizer.
    n_feat = 75
    batch_size = 10

    graph_model = dc.nn.SequentialGraph(n_feat)
    graph_model.add(dc.nn.GraphConv(64, n_feat, activation='relu'))
    graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
    graph_model.add(dc.nn.GraphPool())
    # Gather Projection
    graph_model.add(dc.nn.Dense(128, 64))
    graph_model.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
    graph_model.add(dc.nn.GraphGather(batch_size, activation="tanh"))

    model = dc.models.MultitaskGraphRegressor(
        graph_model,
        n_tasks,
        n_feat,
        batch_size=batch_size,
        learning_rate=1e-2,
        learning_rate_decay_time=1000,
        optimizer_type="adam",
        beta1=.9,
        beta2=.999)

    # Fit trained model
    model.fit(dataset, nb_epoch=40)
    model.save()

    # Eval model on train: overfitting means the training set is fit well.
    scores = model.evaluate(dataset, [regression_metric])

    assert scores[regression_metric.name] < .2

  def test_DTNN_multitask_regression_overfit(self):
    """Check that a deep tensor neural net can memorise a tiny dataset.

    Reads `example_DTNN.mat`, wraps its 'X' and 'T' arrays in a DiskDataset
    with unit weights, trains a DTNN graph through `MultitaskGraphRegressor`
    for 20 epochs and asserts a training-set Pearson R^2 above 0.9.
    """
    np.random.seed(123)
    tf.set_random_seed(123)

    # Pull features ('X') and targets ('T') out of the MATLAB file.
    mat_path = os.path.join(self.current_dir, "example_DTNN.mat")
    mat_contents = scipy.io.loadmat(mat_path)
    features = mat_contents['X']
    targets = mat_contents['T']
    weights = np.ones_like(targets)  # every label weighted equally
    dataset = dc.data.DiskDataset.from_numpy(
        features, targets, weights, ids=None)

    regression_metric = dc.metrics.Metric(
        dc.metrics.pearson_r2_score, task_averager=np.mean)

    # One task per column of the target array.
    n_tasks = targets.shape[1]
    batch_size = 10
    n_feat = 20

    # Embedding -> two interaction steps -> gather, all with 20 embeddings.
    graph_model = dc.nn.SequentialDTNNGraph()
    for layer in (dc.nn.DTNNEmbedding(n_embedding=20),
                  dc.nn.DTNNStep(n_embedding=20),
                  dc.nn.DTNNStep(n_embedding=20),
                  dc.nn.DTNNGather(n_embedding=20)):
      graph_model.add(layer)

    model = dc.models.MultitaskGraphRegressor(
        graph_model,
        n_tasks,
        n_feat,
        batch_size=batch_size,
        learning_rate=1e-3,
        learning_rate_decay_time=1000,
        optimizer_type="adam",
        beta1=.9,
        beta2=.999)

    # Train, persist, then score on the very data we trained on.
    model.fit(dataset, nb_epoch=20)
    model.save()
    scores = model.evaluate(dataset, [regression_metric])

    assert scores[regression_metric.name] > .9

  def test_tensorgraph_DTNN_multitask_regression_overfit(self):
    """Test deep tensor neural net overfits tiny data."""
    np.random.seed(123)
@@ -1069,51 +920,6 @@ class TestOverfit(test_util.TensorFlowTestCase):

    assert scores[regression_metric.name] > .8

  def test_DAG_singletask_regression_overfit(self):
    """Check that a singletask DAG regressor can memorise a tiny dataset.

    Featurizes `example_regression.csv`, applies a `DAGTransformer` capped at
    50 atoms, trains a DAG graph through `MultitaskGraphRegressor` for 50
    epochs and asserts a training-set Pearson R^2 above 0.8.
    """
    np.random.seed(123)
    tf.set_random_seed(123)
    n_tasks = 1

    # Featurize the mini regression dataset found under self.current_dir.
    csv_path = os.path.join(self.current_dir, "example_regression.csv")
    loader = dc.data.CSVLoader(
        tasks=["outcome"],
        smiles_field="smiles",
        featurizer=dc.feat.ConvMolFeaturizer())
    dataset = loader.featurize(csv_path)

    regression_metric = dc.metrics.Metric(
        dc.metrics.pearson_r2_score, task_averager=np.mean)

    n_feat = 75
    batch_size = 10

    # Transformer and both graph layers are configured with max_atoms=50.
    dataset = dc.trans.DAGTransformer(max_atoms=50).transform(dataset)

    graph = dc.nn.SequentialDAGGraph(n_atom_feat=n_feat, max_atoms=50)
    dag_layer = dc.nn.DAGLayer(
        30, n_feat, max_atoms=50, batch_size=batch_size)
    graph.add(dag_layer)
    dag_gather = dc.nn.DAGGather(30, max_atoms=50)
    graph.add(dag_gather)

    model = dc.models.MultitaskGraphRegressor(
        graph,
        n_tasks,
        n_feat,
        batch_size=batch_size,
        learning_rate=0.001,
        learning_rate_decay_time=1000,
        optimizer_type="adam",
        beta1=.9,
        beta2=.999)

    # Train, persist, then score on the very data we trained on.
    model.fit(dataset, nb_epoch=50)
    model.save()
    scores = model.evaluate(dataset, [regression_metric])

    assert scores[regression_metric.name] > .8

  def test_tensorgraph_DAG_singletask_regression_overfit(self):
    """Test DAG regressor multitask overfits tiny data."""
    np.random.seed(123)
@@ -1152,61 +958,6 @@ class TestOverfit(test_util.TensorFlowTestCase):

    assert scores[regression_metric.name] > .8

  def test_weave_singletask_classification_overfit(self):
    """Check that a singletask weave classifier can memorise a tiny dataset.

    Featurizes `example_classification.csv` with `WeaveFeaturizer`, trains a
    two-layer weave graph through `MultitaskGraphClassifier` for 20 epochs
    and asserts a training-set accuracy above 0.65.
    """
    np.random.seed(123)
    tf.set_random_seed(123)
    n_tasks = 1

    # Featurize the mini classification dataset under self.current_dir.
    csv_path = os.path.join(self.current_dir, "example_classification.csv")
    loader = dc.data.CSVLoader(
        tasks=["outcome"],
        smiles_field="smiles",
        featurizer=dc.feat.WeaveFeaturizer())
    dataset = loader.featurize(csv_path)

    classification_metric = dc.metrics.Metric(dc.metrics.accuracy_score)

    n_atom_feat = 75
    n_pair_feat = 14
    n_feat = 128
    batch_size = 10
    max_atoms = 50

    graph = dc.nn.AlternateSequentialWeaveGraph(
        batch_size,
        max_atoms=max_atoms,
        n_atom_feat=n_atom_feat,
        n_pair_feat=n_pair_feat)
    # First weave layer is sized with the raw atom/pair widths (75 and 14).
    first_weave = dc.nn.AlternateWeaveLayer(
        max_atoms, n_atom_feat, n_pair_feat)
    graph.add(first_weave)
    # Second weave layer leaves pair features alone (update_pair=False).
    second_weave = dc.nn.AlternateWeaveLayer(
        max_atoms, 50, 50, update_pair=False)
    graph.add(second_weave)
    graph.add(dc.nn.Dense(n_feat, 50, activation='tanh'))
    graph.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
    gather = dc.nn.AlternateWeaveGather(
        batch_size, n_input=n_feat, gaussian_expand=True)
    graph.add(gather)

    model = dc.models.MultitaskGraphClassifier(
        graph,
        n_tasks,
        n_feat,
        batch_size=batch_size,
        learning_rate=1e-3,
        learning_rate_decay_time=1000,
        optimizer_type="adam",
        beta1=.9,
        beta2=.999)

    # Train, persist, then score on the very data we trained on.
    model.fit(dataset, nb_epoch=20)
    model.save()
    scores = model.evaluate(dataset, [classification_metric])

    assert scores[classification_metric.name] > .65

  def test_tensorgraph_weave_singletask_classification_overfit(self):
    """Test weave model overfits tiny data."""
    np.random.seed(123)
@@ -1246,62 +997,6 @@ class TestOverfit(test_util.TensorFlowTestCase):

    assert scores[classification_metric.name] > .65

  def test_weave_singletask_regression_overfit(self):
    """Check that a singletask weave regressor can memorise a tiny dataset.

    Featurizes `example_regression.csv` with `WeaveFeaturizer`, trains a
    two-layer weave graph through `MultitaskGraphRegressor` for 40 epochs
    and asserts a training-set Pearson R^2 above 0.9.
    """
    np.random.seed(123)
    tf.set_random_seed(123)
    n_tasks = 1

    # Featurize the mini regression dataset found under self.current_dir.
    csv_path = os.path.join(self.current_dir, "example_regression.csv")
    loader = dc.data.CSVLoader(
        tasks=["outcome"],
        smiles_field="smiles",
        featurizer=dc.feat.WeaveFeaturizer())
    dataset = loader.featurize(csv_path)

    regression_metric = dc.metrics.Metric(
        dc.metrics.pearson_r2_score, task_averager=np.mean)

    n_atom_feat = 75
    n_pair_feat = 14
    n_feat = 128
    batch_size = 10
    max_atoms = 50

    graph = dc.nn.AlternateSequentialWeaveGraph(
        batch_size,
        max_atoms=max_atoms,
        n_atom_feat=n_atom_feat,
        n_pair_feat=n_pair_feat)
    # First weave layer is sized with the raw atom/pair widths (75 and 14).
    first_weave = dc.nn.AlternateWeaveLayer(
        max_atoms, n_atom_feat, n_pair_feat)
    graph.add(first_weave)
    # Second weave layer leaves pair features alone (update_pair=False).
    second_weave = dc.nn.AlternateWeaveLayer(
        max_atoms, 50, 50, update_pair=False)
    graph.add(second_weave)
    graph.add(dc.nn.Dense(n_feat, 50, activation='tanh'))
    graph.add(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
    gather = dc.nn.AlternateWeaveGather(
        batch_size, n_input=n_feat, gaussian_expand=True)
    graph.add(gather)

    model = dc.models.MultitaskGraphRegressor(
        graph,
        n_tasks,
        n_feat,
        batch_size=batch_size,
        learning_rate=1e-3,
        learning_rate_decay_time=1000,
        optimizer_type="adam",
        beta1=.9,
        beta2=.999)

    # Train, persist, then score on the very data we trained on.
    model.fit(dataset, nb_epoch=40)
    model.save()
    scores = model.evaluate(dataset, [regression_metric])

    assert scores[regression_metric.name] > .9

  def test_tensorgraph_weave_singletask_regression_overfit(self):
    """Test weave model overfits tiny data."""
    np.random.seed(123)
@@ -1455,226 +1150,6 @@ class TestOverfit(test_util.TensorFlowTestCase):

    assert scores[regression_metric.name] > .9

  def test_siamese_singletask_classification_overfit(self):
    """Test siamese singletask model overfits tiny data.

    Featurizes `example_classification.csv`, builds a support graph with
    separate test/support batch-norm and gather branches, trains a
    `SupportGraphClassifier` and asserts that accuracy on task 0 of the
    training data exceeds 0.75.
    """
    np.random.seed(123)
    tf.set_random_seed(123)
    n_feat = 75
    n_pos = 6
    n_neg = 4
    test_batch_size = 10
    n_train_trials = 80
    # The support set holds every positive and negative example.
    support_batch_size = n_pos + n_neg

    # Load the mini classification dataset found under self.current_dir.
    featurizer = dc.feat.ConvMolFeaturizer()
    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir, "example_classification.csv")
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(input_file)

    classification_metric = dc.metrics.Metric(dc.metrics.accuracy_score)

    support_model = dc.nn.SequentialSupportGraph(n_feat)

    # Add layers
    # output will be (n_atoms, 64)
    support_model.add(dc.nn.GraphConv(64, n_feat, activation='relu'))
    # Need to add batch-norm separately to test/support due to differing
    # shapes.
    # output will be (n_atoms, 64)
    support_model.add_test(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
    # output will be (n_atoms, 64)
    support_model.add_support(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
    support_model.add(dc.nn.GraphPool())
    support_model.add_test(dc.nn.GraphGather(test_batch_size))
    support_model.add_support(dc.nn.GraphGather(support_batch_size))

    model = dc.models.SupportGraphClassifier(
        support_model,
        test_batch_size=test_batch_size,
        support_batch_size=support_batch_size,
        learning_rate=1e-3)

    # Fit trained model. Dataset has 6 positives and 4 negatives, so set
    # n_pos/n_neg accordingly.
    model.fit(
        dataset, n_episodes_per_epoch=n_train_trials, n_pos=n_pos, n_neg=n_neg)
    model.save()

    # Eval model on train. Dataset has 6 positives and 4 negatives, so set
    # n_pos/n_neg accordingly. Support is *not* excluded, so this measures
    # whether the model has memorized the provided support.
    # NOTE(review): an earlier comment said replacement is turned off so the
    # support contains the full training set; no such argument is passed
    # here -- confirm against SupportGraphClassifier.evaluate defaults.
    scores, _ = model.evaluate(
        dataset,
        classification_metric,
        n_trials=5,
        n_pos=n_pos,
        n_neg=n_neg,
        exclude_support=False)

    # Measure performance on 0-th task.
    # TODO(rbharath): Check if something went wrong here... The threshold
    # used to be .9 and was relaxed to .75.
    assert scores[0] > .75

  def test_attn_lstm_singletask_classification_overfit(self):
    """Test attn lstm singletask overfits tiny data.

    Featurizes `example_classification.csv`, builds a support graph joined
    by an `AttnLSTMEmbedding`, trains a `SupportGraphClassifier` and asserts
    that accuracy on task 0 of the training data exceeds 0.79.
    """
    np.random.seed(123)
    tf.set_random_seed(123)
    n_feat = 75
    max_depth = 4
    n_pos = 6
    n_neg = 4
    test_batch_size = 10
    # The support set holds every positive and negative example.
    support_batch_size = n_pos + n_neg
    n_train_trials = 80

    # Load the mini classification dataset found under self.current_dir.
    featurizer = dc.feat.ConvMolFeaturizer()
    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir, "example_classification.csv")
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(input_file)
    classification_metric = dc.metrics.Metric(dc.metrics.accuracy_score)

    support_model = dc.nn.SequentialSupportGraph(n_feat)

    # Add layers
    # output will be (n_atoms, 64)
    support_model.add(dc.nn.GraphConv(64, n_feat, activation='relu'))
    # Need to add batch-norm separately to test/support due to differing
    # shapes.
    # output will be (n_atoms, 64)
    support_model.add_test(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
    # output will be (n_atoms, 64)
    support_model.add_support(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
    support_model.add(dc.nn.GraphPool())
    support_model.add_test(dc.nn.GraphGather(test_batch_size))
    support_model.add_support(dc.nn.GraphGather(support_batch_size))

    # Apply an attention lstm layer
    support_model.join(
        dc.nn.AttnLSTMEmbedding(test_batch_size, support_batch_size, 64,
                                max_depth))

    model = dc.models.SupportGraphClassifier(
        support_model,
        test_batch_size=test_batch_size,
        support_batch_size=support_batch_size,
        learning_rate=1e-3)

    # Fit trained model. Dataset has 6 positives and 4 negatives, so set
    # n_pos/n_neg accordingly.
    model.fit(
        dataset, n_episodes_per_epoch=n_train_trials, n_pos=n_pos, n_neg=n_neg)
    model.save()

    # Eval model on train. Dataset has 6 positives and 4 negatives, so set
    # n_pos/n_neg accordingly. Support is *not* excluded, so this measures
    # whether the model has memorized the provided support.
    # NOTE(review): an earlier comment said replacement is turned off so the
    # support contains the full training set; no such argument is passed
    # here -- confirm against SupportGraphClassifier.evaluate defaults.
    scores, _ = model.evaluate(
        dataset,
        classification_metric,
        n_trials=5,
        n_pos=n_pos,
        n_neg=n_neg,
        exclude_support=False)

    # Measure performance on 0-th task.
    # TODO(rbharath): Check if something went wrong here... The threshold
    # used to be .85 and was relaxed to .79.
    assert scores[0] > .79

  def test_residual_lstm_singletask_classification_overfit(self):
    """Test resi-lstm singletask overfits tiny data.

    Featurizes `example_classification.csv`, builds a support graph joined
    by a `ResiLSTMEmbedding`, trains a `SupportGraphClassifier` and asserts
    that accuracy on task 0 of the training data exceeds 0.65.
    """
    # Seed both RNGs, as the sibling overfit tests do, so that this
    # stochastic test is reproducible instead of varying run to run.
    np.random.seed(123)
    tf.set_random_seed(123)
    n_feat = 75
    max_depth = 4
    n_pos = 6
    n_neg = 4
    test_batch_size = 10
    # The support set holds every positive and negative example.
    support_batch_size = n_pos + n_neg
    n_train_trials = 80

    # Load the mini classification dataset found under self.current_dir.
    featurizer = dc.feat.ConvMolFeaturizer()
    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir, "example_classification.csv")
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(input_file)

    classification_metric = dc.metrics.Metric(dc.metrics.accuracy_score)

    support_model = dc.nn.SequentialSupportGraph(n_feat)

    # Add layers
    # output will be (n_atoms, 64)
    support_model.add(dc.nn.GraphConv(64, n_feat, activation='relu'))
    # Need to add batch-norm separately to test/support due to differing
    # shapes.
    # output will be (n_atoms, 64)
    support_model.add_test(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
    # output will be (n_atoms, 64)
    support_model.add_support(dc.nn.BatchNormalization(epsilon=1e-5, mode=1))
    support_model.add(dc.nn.GraphPool())
    support_model.add_test(dc.nn.GraphGather(test_batch_size))
    support_model.add_support(dc.nn.GraphGather(support_batch_size))

    # Apply a residual lstm layer
    support_model.join(
        dc.nn.ResiLSTMEmbedding(test_batch_size, support_batch_size, 64,
                                max_depth))

    model = dc.models.SupportGraphClassifier(
        support_model,
        test_batch_size=test_batch_size,
        support_batch_size=support_batch_size,
        learning_rate=1e-3)

    # Fit trained model. Dataset has 6 positives and 4 negatives, so set
    # n_pos/n_neg accordingly.
    model.fit(
        dataset, n_episodes_per_epoch=n_train_trials, n_pos=n_pos, n_neg=n_neg)
    model.save()

    # Eval model on train. Dataset has 6 positives and 4 negatives, so set
    # n_pos/n_neg accordingly. Support is *not* excluded, so this measures
    # whether the model has memorized the provided support.
    # NOTE(review): an earlier comment said replacement is turned off so the
    # support contains the full training set; no such argument is passed
    # here -- confirm against SupportGraphClassifier.evaluate defaults.
    scores, _ = model.evaluate(
        dataset,
        classification_metric,
        n_trials=5,
        n_pos=n_pos,
        n_neg=n_neg,
        exclude_support=False)

    # Measure performance on 0-th task.
    # TODO(rbharath): Check if something went wrong here... The threshold
    # used to be .9 and was relaxed to .65.
    assert scores[0] > .65

  def test_tf_progressive_regression_overfit(self):
    """Test tf progressive multitask overfits tiny data."""
    np.random.seed(123)
+0 −321

File changed.

Preview size limit exceeded, changes collapsed.

+0 −576

File changed.

Preview size limit exceeded, changes collapsed.

+0 −265

File changed.

Preview size limit exceeded, changes collapsed.

Loading