Commit 8a672422 authored by haozhenWu

test_generalize for xgboost

parent 7a05ed2b
+96 −3
@@ -19,6 +19,7 @@ import deepchem as dc
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import xgboost

class TestGeneralize(unittest.TestCase):
  """
@@ -185,3 +186,95 @@ class TestGeneralize(unittest.TestCase):
  #  scores = model.evaluate(test_dataset, [classification_metric])
  #  for score in scores[classification_metric.name]:
  #    assert score > .5

  def test_xgboost_regression(self):
    np.random.seed(123)

    dataset = sklearn.datasets.load_diabetes()
    X, y = dataset.data, dataset.target
    frac_train = .7
    n_samples = len(X)
    n_train = int(frac_train*n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    train_dataset = dc.data.NumpyDataset(X_train, y_train)
    test_dataset = dc.data.NumpyDataset(X_test, y_test)

    regression_metric = dc.metrics.Metric(dc.metrics.mae_score)
    # Set early_stopping_rounds equal to n_estimators so that early stopping
    # never triggers.
    esr = {'early_stopping_rounds': 50}
    xgb_model = xgboost.XGBRegressor(n_estimators=50, seed=123)
    model = dc.models.XGBoostModel(xgb_model, verbose=False, **esr)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on test
    scores = model.evaluate(test_dataset, [regression_metric])
    assert scores[regression_metric.name] < 50

  def test_xgboost_multitask_regression(self):
    """Test that xgboost models can learn on simple multitask regression."""
    np.random.seed(123)
    n_tasks = 4
    tasks = range(n_tasks)
    dataset = sklearn.datasets.load_diabetes()
    X, y = dataset.data, dataset.target
    # Duplicate the single diabetes target across all four tasks.
    y = np.reshape(y, (len(y), 1))
    y = np.hstack([y] * n_tasks)

    frac_train = .7
    n_samples = len(X)
    n_train = int(frac_train*n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    train_dataset = dc.data.DiskDataset.from_numpy(X_train, y_train)
    test_dataset = dc.data.DiskDataset.from_numpy(X_test, y_test)

    regression_metric = dc.metrics.Metric(dc.metrics.mae_score)
    # Early stopping disabled: early_stopping_rounds equals n_estimators.
    esr = {'early_stopping_rounds': 50}
    def model_builder(model_dir):
      xgb_model = xgboost.XGBRegressor(n_estimators=50, seed=123)
      return dc.models.XGBoostModel(xgb_model,
                                    model_dir,
                                    verbose=False,
                                    **esr)

    # SingletaskToMultitask trains one model per task via model_builder.
    model = dc.models.SingletaskToMultitask(tasks, model_builder)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on test
    scores = model.evaluate(test_dataset, [regression_metric])
    for score in scores[regression_metric.name]:
      assert score < 50

  def test_xgboost_classification(self):
    """Test that sklearn models can learn on simple classification datasets."""
    np.random.seed(123)
    dataset = sklearn.datasets.load_digits(n_class=2)
    X, y = dataset.data, dataset.target

    frac_train = .7
    n_samples = len(X)
    n_train = int(frac_train*n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    train_dataset = dc.data.NumpyDataset(X_train, y_train)
    test_dataset = dc.data.NumpyDataset(X_test, y_test)

    classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
    # Early stopping disabled: early_stopping_rounds equals n_estimators.
    esr = {'early_stopping_rounds': 50}
    xgb_model = xgboost.XGBClassifier(n_estimators=50, seed=123)
    model = dc.models.XGBoostModel(xgb_model, verbose=False, **esr)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on test
    scores = model.evaluate(test_dataset, [classification_metric])
    assert scores[classification_metric.name] > .9
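
For reference, the wrapper pattern these tests exercise reduces to the following outside the unittest harness. This is a minimal sketch that assumes only the dc.models.XGBoostModel, dc.data.NumpyDataset, and dc.metrics calls already present in the diff; nothing here is part of the commit itself.

import sklearn.datasets
import xgboost
import deepchem as dc

# Same diabetes split as test_xgboost_regression: first 70% train, rest test.
dataset = sklearn.datasets.load_diabetes()
X, y = dataset.data, dataset.target
n_train = int(.7 * len(X))
train = dc.data.NumpyDataset(X[:n_train], y[:n_train])
test = dc.data.NumpyDataset(X[n_train:], y[n_train:])

# early_stopping_rounds equals n_estimators, so early stopping never triggers.
xgb_model = xgboost.XGBRegressor(n_estimators=50, seed=123)
model = dc.models.XGBoostModel(xgb_model, verbose=False,
                               early_stopping_rounds=50)
model.fit(train)
scores = model.evaluate(test, [dc.metrics.Metric(dc.metrics.mae_score)])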