Commit 05fb2bd7 authored by nd-02110114's avatar nd-02110114
Browse files

add reload test

parent 9829acd6
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -60,6 +60,8 @@ class GBDTModel(SklearnModel):
        self.eval_metric: Union[str, Callable[..., Tuple]] = 'auc'
      elif self.model_type == 'regression':
        self.eval_metric = 'mae'
      else:
        self.eval_metric = None
    else:
      self.eval_metric = eval_metric

@@ -69,6 +71,8 @@ class GBDTModel(SklearnModel):
      return 'classification'
    elif class_name.endswith('Regressor'):
      return 'regression'
    elif class_name == 'NoneType':
      return None
    else:
      raise ValueError(
          '{} is not a supported model instance.'.format(class_name))
+2 −2
Original line number Diff line number Diff line
@@ -46,7 +46,6 @@ class SklearnModel(Model):
  def __init__(self,
               model: BaseEstimator,
               model_dir: Optional[str] = None,
               model_instance: Optional[BaseEstimator] = None,
               **kwargs):
    """
    Parameters
@@ -62,7 +61,8 @@ class SklearnModel(Model):
      kwargs['use_weights'] is a bool which determines if we pass weights into
      self.model.fit().
    """
    if model_instance is not None:
    if 'model_instance' in kwargs:
      model_instance = kwargs['model_instance']
      if model is not None:
        raise ValueError(
            "Can not use both model and model_instance argument at the same time."
+106 −121
Original line number Diff line number Diff line
@@ -2,197 +2,182 @@
Tests to make sure deepchem models can fit models on easy datasets.
"""

import sklearn
import sklearn.datasets
import tempfile

import numpy as np
import deepchem as dc
import xgboost
import lightgbm
from sklearn.datasets import load_diabetes, load_digits
from sklearn.model_selection import train_test_split

import deepchem as dc

def test_xgboost_regression():

def test_singletask_regression():
  np.random.seed(123)

  dataset = sklearn.datasets.load_diabetes()
  # prepare dataset
  dataset = load_diabetes()
  X, y = dataset.data, dataset.target
  frac_train = .7
  n_samples = len(X)
  n_train = int(frac_train * n_samples)
  X_train, y_train = X[:n_train], y[:n_train]
  X_test, y_test = X[n_train:], y[n_train:]
  X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=frac_train)
  train_dataset = dc.data.NumpyDataset(X_train, y_train)
  test_dataset = dc.data.NumpyDataset(X_test, y_test)

  # global setting
  regression_metric = dc.metrics.Metric(dc.metrics.mae_score)
  # Set early stopping round = n_estimators so that esr won't work
  esr = {'early_stopping_rounds': 50}

  xgb_model = xgboost.XGBRegressor(
      n_estimators=50, random_state=123, verbose=False)
  model = dc.models.GBDTModel(xgb_model, **esr)
  params = {'early_stopping_rounds': 25}

  # Fit trained model
  # xgboost test
  xgb_model = xgboost.XGBRegressor(n_estimators=50, random_state=123, verbose=False)
  model = dc.models.GBDTModel(xgb_model, **params)
  # fit trained model
  model.fit(train_dataset)
  model.save()
  # eval model on test
  scores = model.evaluate(test_dataset, [regression_metric])
  assert scores[regression_metric.name] < 55

  # Eval model on test
  # lightgbm test
  lgbm_model = lightgbm.LGBMRegressor(n_estimators=50, random_state=123, silent=True)
  model = dc.models.GBDTModel(lgbm_model, **params)
  # fit trained model
  model.fit(train_dataset)
  model.save()
  # eval model on test
  scores = model.evaluate(test_dataset, [regression_metric])
  assert scores[regression_metric.name] < 55


def test_xgboost_multitask_regression():
def test_multitask_regression():
  np.random.seed(123)

  # prepare dataset
  n_tasks = 4
  tasks = range(n_tasks)
  dataset = sklearn.datasets.load_diabetes()
  dataset = load_diabetes()
  X, y = dataset.data, dataset.target
  y = np.reshape(y, (len(y), 1))
  y = np.hstack([y] * n_tasks)

  frac_train = .7
  n_samples = len(X)
  n_train = int(frac_train * n_samples)
  X_train, y_train = X[:n_train], y[:n_train]
  X_test, y_test = X[n_train:], y[n_train:]
  X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=frac_train)
  train_dataset = dc.data.DiskDataset.from_numpy(X_train, y_train)
  test_dataset = dc.data.DiskDataset.from_numpy(X_test, y_test)

  # global setting
  regression_metric = dc.metrics.Metric(dc.metrics.mae_score)
  esr = {'early_stopping_rounds': 50}
  params = {'early_stopping_rounds': 25}

  def model_builder(model_dir):
  # xgboost test
  def xgboost_builder(model_dir):
    xgb_model = xgboost.XGBRegressor(n_estimators=50, seed=123, verbose=False)
    return dc.models.GBDTModel(xgb_model, model_dir, **esr)

  model = dc.models.SingletaskToMultitask(tasks, model_builder)

  # Fit trained model
    return dc.models.GBDTModel(xgb_model, model_dir, **params)
  model = dc.models.SingletaskToMultitask(tasks, xgboost_builder)
  # fit trained model
  model.fit(train_dataset)
  model.save()
  # eval model on test
  scores = model.evaluate(test_dataset, [regression_metric])
  score = scores[regression_metric.name]
  assert score < 55

  # Eval model on test
  # lightgbm test
  def lightgbm_builder(model_dir):
    lgbm_model = lightgbm.LGBMRegressor(n_estimators=50, seed=123, silent=False)
    return dc.models.GBDTModel(lgbm_model, model_dir, **params)
  model = dc.models.SingletaskToMultitask(tasks, lightgbm_builder)
  # fit trained model
  model.fit(train_dataset)
  model.save()
  # eval model on test
  scores = model.evaluate(test_dataset, [regression_metric])
  score = scores[regression_metric.name]
  assert score < 55


def test_xgboost_classification():
def test_classification():
  """Test that sklearn models can learn on simple classification datasets."""
  np.random.seed(123)
  dataset = sklearn.datasets.load_digits(n_class=2)
  X, y = dataset.data, dataset.target

  # prepare dataset
  dataset = load_digits(n_class=2)
  X, y = dataset.data, dataset.target
  frac_train = .7
  n_samples = len(X)
  n_train = int(frac_train * n_samples)
  X_train, y_train = X[:n_train], y[:n_train]
  X_test, y_test = X[n_train:], y[n_train:]
  X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=frac_train)
  train_dataset = dc.data.NumpyDataset(X_train, y_train)
  test_dataset = dc.data.NumpyDataset(X_test, y_test)

  # global setting
  classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
  esr = {'early_stopping_rounds': 50}
  xgb_model = xgboost.XGBClassifier(n_estimators=50, seed=123, verbose=False)
  model = dc.models.GBDTModel(xgb_model, **esr)
  params = {'early_stopping_rounds': 25}

  # Fit trained model
  # xgboost test
  xgb_model = xgboost.XGBClassifier(n_estimators=50, seed=123, verbose=False)
  model = dc.models.GBDTModel(xgb_model, **params)
  # fit trained model
  model.fit(train_dataset)
  model.save()
  # eval model on test
  scores = model.evaluate(test_dataset, [classification_metric])
  assert scores[classification_metric.name] > .9

  # Eval model on test
  # lightgbm test
  lgbm_model = lightgbm.LGBMClassifier(n_estimators=50, seed=123, silent=True)
  model = dc.models.GBDTModel(lgbm_model, **params)
  # fit trained model
  model.fit(train_dataset)
  model.save()
  # eval model on test
  scores = model.evaluate(test_dataset, [classification_metric])
  assert scores[classification_metric.name] > .9


def test_lightgbm_regression():
def test_reload():
  np.random.seed(123)

  dataset = sklearn.datasets.load_diabetes()
  # prepare dataset
  dataset = load_diabetes()
  X, y = dataset.data, dataset.target
  frac_train = .7
  n_samples = len(X)
  n_train = int(frac_train * n_samples)
  X_train, y_train = X[:n_train], y[:n_train]
  X_test, y_test = X[n_train:], y[n_train:]
  X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=frac_train)
  train_dataset = dc.data.NumpyDataset(X_train, y_train)
  test_dataset = dc.data.NumpyDataset(X_test, y_test)

  # global setting
  regression_metric = dc.metrics.Metric(dc.metrics.mae_score)
  # Set early stopping round = n_estimators so that esr won't work
  esr = {'early_stopping_rounds': 50}
  model_dir = tempfile.mkdtemp()
  params = {'early_stopping_rounds': 25, 'model_dir': model_dir}

  lgbm_model = lightgbm.LGBMRegressor(
      n_estimators=50, random_state=123, silent=True)
  model = dc.models.GBDTModel(lgbm_model, **esr)

  # Fit trained model
  # xgboost test
  xgb_model = xgboost.XGBRegressor(n_estimators=50, random_state=123, verbose=False)
  model = dc.models.GBDTModel(xgb_model, **params)
  # fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on test
  scores = model.evaluate(test_dataset, [regression_metric])
  # reload
  reloaded_model = dc.models.GBDTModel(None, model_dir)
  reloaded_model.reload()
  # check predictions match on test dataset
  original_pred = model.predict(test_dataset)
  reload_pred = reloaded_model.predict(test_dataset)
  assert np.all(original_pred == reload_pred)
  # eval model on test
  scores = reloaded_model.evaluate(test_dataset, [regression_metric])
  assert scores[regression_metric.name] < 55


def test_lightgbm_multitask_regression():
  np.random.seed(123)
  n_tasks = 4
  tasks = range(n_tasks)
  dataset = sklearn.datasets.load_diabetes()
  X, y = dataset.data, dataset.target
  y = np.reshape(y, (len(y), 1))
  y = np.hstack([y] * n_tasks)

  frac_train = .7
  n_samples = len(X)
  n_train = int(frac_train * n_samples)
  X_train, y_train = X[:n_train], y[:n_train]
  X_test, y_test = X[n_train:], y[n_train:]
  train_dataset = dc.data.DiskDataset.from_numpy(X_train, y_train)
  test_dataset = dc.data.DiskDataset.from_numpy(X_test, y_test)

  regression_metric = dc.metrics.Metric(dc.metrics.mae_score)
  esr = {'early_stopping_rounds': 50}

  def model_builder(model_dir):
    lgbm_model = lightgbm.LGBMRegressor(n_estimators=50, seed=123, silent=True)
    return dc.models.GBDTModel(lgbm_model, model_dir, **esr)

  model = dc.models.SingletaskToMultitask(tasks, model_builder)

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on test
  scores = model.evaluate(test_dataset, [regression_metric])
  score = scores[regression_metric.name]
  assert score < 55


def test_lightgbm_classification():
  """Test that sklearn models can learn on simple classification datasets."""
  np.random.seed(123)
  dataset = sklearn.datasets.load_digits(n_class=2)
  X, y = dataset.data, dataset.target

  frac_train = .7
  n_samples = len(X)
  n_train = int(frac_train * n_samples)
  X_train, y_train = X[:n_train], y[:n_train]
  X_test, y_test = X[n_train:], y[n_train:]
  train_dataset = dc.data.NumpyDataset(X_train, y_train)
  test_dataset = dc.data.NumpyDataset(X_test, y_test)

  classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
  esr = {'early_stopping_rounds': 50}
  lgbm_model = lightgbm.LGBMClassifier(n_estimators=50, seed=123, silent=True)
  model = dc.models.GBDTModel(lgbm_model, **esr)

  # Fit trained model
  # lightgbm test
  lgbm_model = lightgbm.LGBMRegressor(n_estimators=50, random_state=123, silent=True)
  model = dc.models.GBDTModel(lgbm_model, **params)
  # fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on test
  scores = model.evaluate(test_dataset, [classification_metric])
  assert scores[classification_metric.name] > .9
  # reload
  reloaded_model = dc.models.GBDTModel(None, model_dir)
  reloaded_model.reload()
  # check predictions match on test dataset
  original_pred = model.predict(test_dataset)
  reload_pred = reloaded_model.predict(test_dataset)
  assert np.all(original_pred == reload_pred)
  # eval model on test
  scores = reloaded_model.evaluate(test_dataset, [regression_metric])
  assert scores[regression_metric.name] < 55