Commit 6c4008c8 authored by Ubuntu

run yapf

parent 8a672422
+23 −22
@@ -21,6 +21,7 @@ from sklearn.linear_model import LinearRegression
 from sklearn.linear_model import LogisticRegression
 import xgboost

+
 class TestGeneralize(unittest.TestCase):
   """
   Test that models can learn generalizable models on simple datasets.
@@ -71,10 +72,10 @@ class TestGeneralize(unittest.TestCase):
     transformers = [
         dc.trans.NormalizationTransformer(
             transform_X=True, dataset=train_dataset),
-        dc.trans.ClippingTransformer(
-            transform_X=True, dataset=train_dataset),
+        dc.trans.ClippingTransformer(transform_X=True, dataset=train_dataset),
         dc.trans.NormalizationTransformer(
-            transform_y=True, dataset=train_dataset)]
+            transform_y=True, dataset=train_dataset)
+    ]
     for data in [train_dataset, test_dataset]:
       for transformer in transformers:
         data = transformer.transform(data)
@@ -115,9 +116,11 @@ class TestGeneralize(unittest.TestCase):
     test_dataset = dc.data.DiskDataset.from_numpy(X_test, y_test)

     regression_metric = dc.metrics.Metric(dc.metrics.r2_score)
+
     def model_builder(model_dir):
       sklearn_model = LinearRegression()
       return dc.models.SklearnModel(sklearn_model, model_dir)
+
     model = dc.models.SingletaskToMultitask(tasks, model_builder)

     # Fit trained model
@@ -234,12 +237,10 @@ class TestGeneralize(unittest.TestCase):

     regression_metric = dc.metrics.Metric(dc.metrics.mae_score)
     esr = {'early_stopping_rounds': 50}
+
     def model_builder(model_dir):
       xgb_model = xgboost.XGBRegressor(n_estimators=50, seed=123)
-      return dc.models.XGBoostModel(xgb_model,
-                                    model_dir,
-                                    verbose=False,
-                                    **esr)
+      return dc.models.XGBoostModel(xgb_model, model_dir, verbose=False, **esr)

     model = dc.models.SingletaskToMultitask(tasks, model_builder)

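The tests in this file all follow the same builder pattern: dc.models.SingletaskToMultitask takes a list of task names plus a model_builder callable and, as used here, constructs one independent singletask model per task. A minimal sketch of that wiring, assuming deepchem is installed and using hypothetical task names:

import deepchem as dc
from sklearn.linear_model import LinearRegression

tasks = ['task0', 'task1']  # hypothetical task names

def model_builder(model_dir):
  # Called once per task; each call yields a fresh singletask model.
  sklearn_model = LinearRegression()
  return dc.models.SklearnModel(sklearn_model, model_dir)

model = dc.models.SingletaskToMultitask(tasks, model_builder)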
+51 −48
@@ -13,12 +13,17 @@ from sklearn.cross_validation import train_test_split
 from sklearn.grid_search import GridSearchCV
 import tempfile

+
 class XGBoostModel(SklearnModel):
   """
   Abstract base class for XGBoost model.
   """
-  def __init__(self, model_instance=None, model_dir=None,
-               verbose=True, **kwargs):
+
+  def __init__(self,
+               model_instance=None,
+               model_dir=None,
+               verbose=True,
+               **kwargs):
     """Abstract class for XGBoost models.
     Parameters:
     -----------
@@ -42,7 +47,6 @@ class XGBoostModel(SklearnModel):
     else:
       self.early_stopping_rounds = 50

-
   def fit(self, dataset, **kwargs):
     """
     Fits XGBoost model to data.
@@ -65,12 +69,12 @@

     # Find optimal n_estimators based on original learning_rate
     # and early_stopping_rounds
-    X_train, X_test, y_train, y_test = train_test_split(X, y,
-                                                        test_size = 0.2,
-                                                        random_state=seed,
-                                                        stratify=stratify)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=seed, stratify=stratify)

-    self.model_instance.fit(X_train, y_train,
+    self.model_instance.fit(
+        X_train,
+        y_train,
         early_stopping_rounds=self.early_stopping_rounds,
         eval_metric=xgb_metric,
         eval_set=[(X_train, y_train), (X_test, y_test)],
@@ -79,8 +83,8 @@
     # n_estimator increased to 1/0.8 = 1.25 time.
     estimated_best_round = np.round(self.model_instance.best_ntree_limit * 1.25)
     self.model_instance.n_estimators = np.int64(estimated_best_round)
-    self.model_instance.fit(X_train, y_train, eval_metric=xgb_metric,
-                            verbose=self.verbose)
+    self.model_instance.fit(
+        X_train, y_train, eval_metric=xgb_metric, verbose=self.verbose)

   def _search_param(self, metric, X, y):
     '''
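Spelling out the fit() logic above: early stopping runs on an 80/20 split to find best_ntree_limit, and because that search trained on only 80% of the rows, the tree count is scaled by 1/0.8 = 1.25 before the final fit (e.g. best_ntree_limit = 200 refits with n_estimators = 250). A standalone sketch of the same heuristic on synthetic placeholder data, assuming the older xgboost sklearn wrapper whose fit() accepts early_stopping_rounds and exposes best_ntree_limit (both removed in xgboost 2.0):

import numpy as np
import xgboost
from sklearn.model_selection import train_test_split

X, y = np.random.rand(500, 10), np.random.rand(500)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123)

model = xgboost.XGBRegressor(n_estimators=1000)
# Early stopping against the 20% holdout finds the effective tree count.
model.fit(
    X_train,
    y_train,
    early_stopping_rounds=50,
    eval_metric='mae',
    eval_set=[(X_test, y_test)],
    verbose=False)

# The search trained on only 80% of the data, so scale the best round
# up by 1 / 0.8 = 1.25 before refitting.
model.n_estimators = np.int64(np.round(model.best_ntree_limit * 1.25))
model.fit(X_train, y_train, eval_metric='mae', verbose=False)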
@@ -88,10 +92,9 @@
     '''
     # Make sure user specified params are in the grid.
     max_depth_grid = list(np.unique([self.model_instance.max_depth, 5, 7]))
-    colsample_bytree_grid = list(np.unique(
-                                [self.model_instance.colsample_bytree,0.66,0.9]))
-    reg_lambda_grid = list(np.unique(
-                                [self.model_instance.reg_lambda,1,5]))
+    colsample_bytree_grid = list(
+        np.unique([self.model_instance.colsample_bytree, 0.66, 0.9]))
+    reg_lambda_grid = list(np.unique([self.model_instance.reg_lambda, 1, 5]))
     param_grid = {
         'max_depth': max_depth_grid,
         'learning_rate': [max(self.model_instance.learning_rate, 0.3)],
@@ -108,8 +111,8 @@
         'base_score': [self.model_instance.base_score],
         'seed': [self.model_instance.seed]
     }
-    grid_search = GridSearchCV(self.model_instance, param_grid, cv=2,
-                                refit=False, scoring=metric)
+    grid_search = GridSearchCV(
+        self.model_instance, param_grid, cv=2, refit=False, scoring=metric)
     grid_search.fit(X, y)
     best_params = grid_search.best_params_
     # Change params back original params
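The _search_param() hunks above implement a small idea worth noting: the grid is seeded with the user's own current hyperparameter values (np.unique merges them with the candidate values and drops duplicates), then a cheap 2-fold GridSearchCV with refit=False is run and best_params_ is read off. A minimal sketch of that seeding, using sklearn.model_selection in place of the long-deprecated sklearn.grid_search imported above, with placeholder data and scoring:

import numpy as np
import xgboost
from sklearn.model_selection import GridSearchCV

model = xgboost.XGBRegressor(max_depth=6, reg_lambda=2)
param_grid = {
    # np.unique keeps the user's setting in the grid alongside the
    # candidate values, deduplicating when they coincide.
    'max_depth': list(np.unique([model.max_depth, 5, 7])),
    'reg_lambda': list(np.unique([model.reg_lambda, 1, 5])),
}
grid_search = GridSearchCV(
    model, param_grid, cv=2, refit=False, scoring='neg_mean_absolute_error')
X, y = np.random.rand(100, 8), np.random.rand(100)
grid_search.fit(X, y)
print(grid_search.best_params_)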
+58 −49
@@ -58,6 +58,7 @@ from clintox.clintox_datasets import load_clintox
 from hiv.hiv_datasets import load_hiv
 import xgboost

+
 def benchmark_loading_datasets(hyper_parameters,
                                dataset='tox21',
                                model='tf',
@@ -313,8 +314,9 @@ def benchmark_classification(train_dataset,
   if metric == 'auc':
     classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)

-  assert model in ['rf', 'tf', 'tf_robust', 'logreg', 'irv', 'graphconv',
-                   'xgb_classifier']
+  assert model in [
+      'rf', 'tf', 'tf_robust', 'logreg', 'irv', 'graphconv', 'xgb_classifier'
+  ]

   if model == 'tf':
     # Loading hyper parameters
@@ -584,6 +586,7 @@ def benchmark_classification(train_dataset,
     early_stopping_rounds = hyper_parameters['early_stopping_rounds']

     esr = {'early_stopping_rounds': early_stopping_rounds}
+
     # Building xgboost classification model
     def model_builder(model_dir_xgb):
       xgboost_model = xgboost.XGBClassifier(
@@ -601,9 +604,9 @@ def benchmark_classification(train_dataset,
           scale_pos_weight=scale_pos_weight,
           base_score=base_score,
           seed=seed)
-        return dc.models.xgboost_models.XGBoostModel(xgboost_model,
-                                                            model_dir_xgb,
-                                                            **esr)
+      return dc.models.xgboost_models.XGBoostModel(xgboost_model, model_dir_xgb,
+                                                   **esr)

     model_xgb = dc.models.multitask.SingletaskToMultitask(tasks, model_builder)

     print('-------------------------------------')
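One idiom above deserves a gloss: esr = {'early_stopping_rounds': ...} is splatted into the model constructor as **esr, and XGBoostModel.__init__ (see the -42,7 hunk earlier) pulls the key out of **kwargs, defaulting to 50 when it is absent. A toy round-trip with a stand-in class (Model here is hypothetical, not the deepchem class):

class Model(object):

  def __init__(self, verbose=True, **kwargs):
    # Same fallback as shown in XGBoostModel.__init__ above.
    if 'early_stopping_rounds' in kwargs:
      self.early_stopping_rounds = kwargs['early_stopping_rounds']
    else:
      self.early_stopping_rounds = 50

esr = {'early_stopping_rounds': 100}
print(Model(verbose=False, **esr).early_stopping_rounds)  # 100
print(Model().early_stopping_rounds)                      # 50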
@@ -680,8 +683,9 @@ def benchmark_regression(train_dataset,
     regression_metric = dc.metrics.Metric(dc.metrics.mean_absolute_error,
                                           np.mean)

-  assert model in ['tf_regression', 'rf_regression', 'graphconvreg',
-                   'xgb_regression']
+  assert model in [
+      'tf_regression', 'rf_regression', 'graphconvreg', 'xgb_regression'
+  ]

   if model == 'tf_regression':
     # Loading hyper parameters
@@ -843,6 +847,7 @@ def benchmark_regression(train_dataset,
     early_stopping_rounds = hyper_parameters['early_stopping_rounds']

     esr = {'early_stopping_rounds': early_stopping_rounds}
+
     # Building xgboost classification model
     def model_builder(model_dir_xgb):
       xgboost_model = xgboost.XGBRegressor(
@@ -860,9 +865,9 @@ def benchmark_regression(train_dataset,
           scale_pos_weight=scale_pos_weight,
           base_score=base_score,
           seed=seed)
-        return dc.models.xgboost_models.XGBoostModel(xgboost_model,
-                                                            model_dir_xgb,
-                                                            **esr)
+      return dc.models.xgboost_models.XGBoostModel(xgboost_model, model_dir_xgb,
+                                                   **esr)

     model_xgb = dc.models.multitask.SingletaskToMultitask(tasks, model_builder)

     print('-------------------------------------')
@@ -902,7 +907,8 @@ if __name__ == '__main__':
       dest='model_args',
       default=[],
       help='Choice of model: tf, tf_robust, logreg, rf, irv, graphconv, ' +
-      'tf_regression, rf_regression, graphconvreg, xgb_classifier, xgb_regression')
+      'tf_regression, rf_regression, graphconvreg, xgb_classifier, xgb_regression'
+  )
   parser.add_argument(
       '-d',
       action='append',
@@ -1062,8 +1068,10 @@ if __name__ == '__main__':
           'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'hiv'
       ]:
         for model in models:
-          if model in ['tf', 'tf_robust', 'logreg', 'graphconv', 'rf', 'irv',
-                        'xgb_classifier']:
+          if model in [
+              'tf', 'tf_robust', 'logreg', 'graphconv', 'rf', 'irv',
+              'xgb_classifier'
+          ]:
             benchmark_loading_datasets(
                 hps,
                 dataset=dataset,
@@ -1073,8 +1081,9 @@ if __name__ == '__main__':
                 test=test)
       else:
         for model in models:
-          if model in ['tf_regression', 'rf_regression', 'graphconvreg',
-                        'xgb_regression']:
+          if model in [
+              'tf_regression', 'rf_regression', 'graphconvreg', 'xgb_regression'
+          ]:
             benchmark_loading_datasets(
                 hps,
                 dataset=dataset,