Commit 20b0a55b authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Refactoring gaussian process optimizer.

parent ec91d498
Loading
Loading
Loading
Loading
+122 −1
Original line number Diff line number Diff line

def compute_parameter_search_space(params_dict, search_range=4,
                                   hp_invalid_list=None):
  """Convenience function to compute a hyperparameter search space.

  Parameters
  ----------
  params_dict: dict
    Dictionary mapping strings to ints/floats/lists. For parameters
    whose value is an int/float, an explicit search range is computed
    from `search_range`. Parameters named in `hp_invalid_list` are
    excluded from the search space.
  search_range: int or float (default 4)
    For int/float values in `params_dict`, computes an optimization
    range of `[initial value / search_range, initial value *
    search_range]`.
  hp_invalid_list: list of str, optional
    Names of parameters that should not be optimized over. If not
    provided, defaults to the standard list of non-optimizable model
    parameters.

  Returns
  -------
  param_range: list of tuples
    One `(type, [low, high])` tuple per scalar search dimension, where
    `type` is `'int'` or `'cont'` (continuous). List-valued parameters
    contribute one tuple per list element. Order follows the insertion
    order of `params_dict`, scalars first, then list-valued parameters.

  Raises
  ------
  ValueError
    If any optimizable value in `params_dict` is not a list/int/float.
  """
  if hp_invalid_list is None:
    hp_invalid_list = [
        'seed', 'nb_epoch', 'penalty_type', 'dropouts', 'bypass_dropouts',
        'n_pair_feat', 'fit_transformers', 'min_child_weight',
        'max_delta_step', 'subsample', 'colsample_bylevel',
        'colsample_bytree', 'reg_alpha', 'reg_lambda', 'scale_pos_weight',
        'base_score'
    ]
  hyper_parameters = params_dict
  # Keep only the parameters that are eligible for optimization.
  hp_list = [hp for hp in hyper_parameters.keys() if hp not in hp_invalid_list]

  hp_list_class = [hyper_parameters[hp].__class__ for hp in hp_list]
  # Check the types are correct before expanding anything.
  if not (set(hp_list_class) <= set([list, int, float])):
    raise ValueError(
        "params_dict must contain values that are lists/ints/floats.")

  # Float or int hyper parameters (e.g. batch_size, learning_rate)
  hp_list_single = [
      hp_list[i] for i in range(len(hp_list)) if hp_list_class[i] is not list
  ]

  # List of float or int hyper parameters (e.g. layer_sizes), stored with
  # their lengths so each element becomes its own search dimension.
  hp_list_multiple = [(hp_list[i], len(hyper_parameters[hp_list[i]]))
                      for i in range(len(hp_list))
                      if hp_list_class[i] is list]

  # Range of optimization: ints use floor division so bounds stay ints.
  param_range = []
  for hp in hp_list_single:
    if hyper_parameters[hp].__class__ is int:
      param_range.append((('int'), [
          hyper_parameters[hp] // search_range,
          hyper_parameters[hp] * search_range
      ]))
    else:
      param_range.append((('cont'), [
          hyper_parameters[hp] / search_range,
          hyper_parameters[hp] * search_range
      ]))
  for hp in hp_list_multiple:
    if hyper_parameters[hp[0]][0].__class__ is int:
      param_range.extend([(('int'), [
          hyper_parameters[hp[0]][i] // search_range,
          hyper_parameters[hp[0]][i] * search_range
      ]) for i in range(hp[1])])
    else:
      param_range.extend([(('cont'), [
          hyper_parameters[hp[0]][i] / search_range,
          hyper_parameters[hp[0]][i] * search_range
      ]) for i in range(hp[1])])
  return param_range

class HyperparamOpt(object):
  """Abstract superclass for hyperparameter search classes.

@@ -13,9 +81,14 @@ class HyperparamOpt(object):
  strategy for searching the hyperparameter evaluation space. This
  class itself is an abstract superclass and should never be directly
  instantiated.

  Objects of this class maintain a list of constants,
  `hp_invalid_list`, that contains model parameters which
  cannot be optimized over. This list is used to catch user errors. You
  can customize this list in the constructor.
  """

  def __init__(self, model_class):
  def __init__(self, model_class, hp_invalid_list=['seed', 'nb_epoch', 'penalty_type', 'dropouts', 'bypass_dropouts', 'n_pair_feat', 'fit_transformers', 'min_child_weight', 'max_delta_step', 'subsample', 'colsample_bylevel', 'colsample_bytree', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'base_score']):
    """Initialize Hyperparameter Optimizer.

    Note this is an abstract constructor which should only be used by
@@ -39,9 +112,57 @@ class HyperparamOpt(object):
      must accept two arguments, `model_params` of type `dict` and
      `model_dir`, a string specifying a path to a model directory.
      See the example.
    hp_invalid_list: list, (default `['seed', 'nb_epoch', 'penalty_type', 'dropouts', 'bypass_dropouts', 'n_pair_feat', 'fit_transformers', 'min_child_weight', 'max_delta_step', 'subsample', 'colsample_bylevel', 'colsample_bytree', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'base_score']`)
    """
    if self.__class__.__name__ == "HyperparamOpt":
      raise ValueError(
          "HyperparamOpt is an abstract superclass and cannot be directly instantiated. You probably want to instantiate a concrete subclass instead."
      )
    self.model_class = model_class
    self.hp_invalid_list = hp_invalid_list

  def hyperparam_search(self,
                        params_dict,
                        train_dataset,
                        valid_dataset,
                        transformers,
                        metric,
                        use_max=True,
                        logdir=None):
    """Conduct Hyperparameter search.

    This method defines the common API shared by all hyperparameter
    optimization subclasses. Different classes will implement
    different search methods but they must all follow this common API.

    Parameters
    ----------
    params_dict: dict
      Dictionary mapping hyperparameter names (strings) to
      Ints/Floats/Lists of candidate values. How scalar values are
      expanded into search ranges is subclass-specific.
    train_dataset: `dc.data.Dataset`
      dataset used for training
    valid_dataset: `dc.data.Dataset`
      dataset used for validation(optimization on valid scores)
    transformers: list[dc.trans.Transformer]
      Transformers for evaluation. This argument is needed since
      `train_dataset` and `valid_dataset` may have been transformed
      for learning and need the transform to be inverted before
      the metric can be evaluated on a model.
    metric: `dc.metrics.Metric`
      metric used to score models on the validation set
    use_max: bool, optional
      If True, return the model with the highest score. Else return
      model with the minimum score.
    logdir: str, optional
      The directory in which to store created models. If not set, will
      use a temporary directory.

    Returns
    -------
    `(best_model, best_hyperparams, all_scores)` where `best_model` is
    an instance of `dc.model.Models`, `best_hyperparams` is a
    dictionary of parameters, and `all_scores` is a dictionary mapping
    string representations of hyperparameter sets to validation
    scores.

    Raises
    ------
    NotImplementedError
      Always; concrete subclasses must override this method.
    """
    raise NotImplementedError
+67 −73
Original line number Diff line number Diff line
@@ -16,6 +16,13 @@ logger = logging.getLogger(__name__)
class GaussianProcessHyperparamOpt(HyperparamOpt):
  """
  Gaussian Process Global Optimization(GPGO)

  This class uses Gaussian Process optimization to select
  hyperparameters. Note that this class can only optimize 20
  parameters at a time.

  TODO: This class is too tied up with the MoleculeNet benchmarking.
  This needs to be refactored out cleanly.
  """

  def hyperparam_search(
@@ -23,46 +30,34 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
      params_dict,
      train_dataset,
      valid_dataset,
      output_transformers,
      transformers,
      metric,
      direction=True,
      use_max=True,
      logdir=None,
      n_features=1024,
      n_tasks=1,
      max_iter=20,
      search_range=4,
      hp_invalid_list=[
          'seed', 'nb_epoch', 'penalty_type', 'dropouts', 'bypass_dropouts',
          'n_pair_feat', 'fit_transformers', 'min_child_weight',
          'max_delta_step', 'subsample', 'colsample_bylevel',
          'colsample_bytree', 'reg_alpha', 'reg_lambda', 'scale_pos_weight',
          'base_score'
      ],
      log_file='GPhypersearch.log'):
    """Perform hyperparams search using a gaussian process assumption

    `params_dict` should map names of parameters being optimized to a
    list of parameter values, which should only contain int, float and
    list of int(float). Parameters with names in hp_invalid_list will
    not be changed.

    For Molnet models, self.model_class is model name in string,
    params_dict = dc.molnet.preset_hyper_parameters.hps[self.model_class]
    """Perform hyperparameter search using a gaussian process.

    Parameters
    ----------
    params_dict: dict
      dict including parameters and their initial values
      parameters not suitable for optimization can be added to hp_invalid_list
    train_dataset: `dc.data.Dataset`
      dataset used for training
    valid_dataset: `dc.data.Dataset`
      dataset used for validation(optimization on valid scores)
    output_transformers: list[dc.trans.Transformer]
    transformers: list[dc.trans.Transformer]
      transformers for evaluation
    metric: `dc.metrics.Metric`
      metric used for evaluation
    direction: bool, (default True)
    use_max: bool, (default True)
      maximization(True) or minimization(False)
    logdir: str, optional
      The directory in which to store created models. If not set, will
      use a temporary directory.
    n_features: int, (default 1024)
      number of input features
    n_tasks: int, (default 1)
@@ -72,7 +67,6 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
    search_range: int(float) (default 4)
      optimization on [initial values / search_range,
                       initial values * search_range]
    hp_invalid_list: list, (default `['seed', 'nb_epoch', 'penalty_type', 'dropouts', 'bypass_dropouts', 'n_pair_feat', 'fit_transformers', 'min_child_weight', 'max_delta_step', 'subsample', 'colsample_bylevel', 'colsample_bytree', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'base_score']`)
      names of parameters that should not be optimized
    log_file: str
      name of log file, hyperparameters and results for each trial
@@ -80,56 +74,55 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):

    Returns
    -------
    hyper_parameters: dict
      params_dict with all optimized values
    valid_performance_opt: float
      best performance on valid dataset
    `(best_model, best_hyperparams, all_scores)` where `best_model` is
    an instance of `dc.model.Models`, `best_hyperparams` is a
    dictionary of parameters, and `all_scores` is a dictionary mapping
    string representations of hyperparameter sets to validation
    scores.
    """
    hyper_parameters = params_dict
    hp_list = list(hyper_parameters.keys())
    for hp in hp_invalid_list:
      if hp in hp_list:
        hp_list.remove(hp)

    hp_list_class = [hyper_parameters[hp].__class__ for hp in hp_list]
    assert set(hp_list_class) <= set([list, int, float])
    # Float or int hyper parameters(ex. batch_size, learning_rate)
    hp_list_single = [
        hp_list[i] for i in range(len(hp_list)) if not hp_list_class[i] is list
    ]
    # List of float or int hyper parameters(ex. layer_sizes)
    hp_list_multiple = [(hp_list[i], len(hyper_parameters[hp_list[i]]))
                        for i in range(len(hp_list))
                        if hp_list_class[i] is list]
    if len(params_dict) > 20:
      raise ValueError("This class can only search over 20 parameters in one invocation.")
    #hyper_parameters = params_dict
    #hp_list = list(hyper_parameters.keys())
    #hp_list_class = [hyper_parameters[hp].__class__ for hp in hp_list]
    #assert set(hp_list_class) <= set([list, int, float])
    ## Float or int hyper parameters(ex. batch_size, learning_rate)
    #hp_list_single = [
    #    hp_list[i] for i in range(len(hp_list)) if not hp_list_class[i] is list
    #]
    ## List of float or int hyper parameters(ex. layer_sizes)
    #hp_list_multiple = [(hp_list[i], len(hyper_parameters[hp_list[i]]))
    #                    for i in range(len(hp_list))
    #                    if hp_list_class[i] is list]

    # Number of parameters
    n_param = len(hp_list_single)
    if len(hp_list_multiple) > 0:
      n_param = n_param + sum([hp[1] for hp in hp_list_multiple])
    # Range of optimization
    param_range = []
    for hp in hp_list_single:
      if hyper_parameters[hp].__class__ is int:
        param_range.append((('int'), [
            hyper_parameters[hp] // search_range,
            hyper_parameters[hp] * search_range
        ]))
      else:
        param_range.append((('cont'), [
            hyper_parameters[hp] / search_range,
            hyper_parameters[hp] * search_range
        ]))
    for hp in hp_list_multiple:
      if hyper_parameters[hp[0]][0].__class__ is int:
        param_range.extend([(('int'), [
            hyper_parameters[hp[0]][i] // search_range,
            hyper_parameters[hp[0]][i] * search_range
        ]) for i in range(hp[1])])
      else:
        param_range.extend([(('cont'), [
            hyper_parameters[hp[0]][i] / search_range,
            hyper_parameters[hp[0]][i] * search_range
        ]) for i in range(hp[1])])
    ## Range of optimization
    #param_range = []
    #for hp in hp_list_single:
    #  if hyper_parameters[hp].__class__ is int:
    #    param_range.append((('int'), [
    #        hyper_parameters[hp] // search_range,
    #        hyper_parameters[hp] * search_range
    #    ]))
    #  else:
    #    param_range.append((('cont'), [
    #        hyper_parameters[hp] / search_range,
    #        hyper_parameters[hp] * search_range
    #    ]))
    #for hp in hp_list_multiple:
    #  if hyper_parameters[hp[0]][0].__class__ is int:
    #    param_range.extend([(('int'), [
    #        hyper_parameters[hp[0]][i] // search_range,
    #        hyper_parameters[hp[0]][i] * search_range
    #    ]) for i in range(hp[1])])
    #  else:
    #    param_range.extend([(('cont'), [
    #        hyper_parameters[hp[0]][i] / search_range,
    #        hyper_parameters[hp[0]][i] * search_range
    #    ]) for i in range(hp[1])])

    # Dummy names
    param_name = ['l' + format(i, '02d') for i in range(20)]
@@ -159,6 +152,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
          l18=0,
          l19=0):
      """ Optimizing function

      Take in hyper parameter values and return valid set performances

      Parameters
@@ -200,7 +194,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
              train_dataset,
              valid_dataset,
              valid_dataset, ['task_placeholder'] * n_tasks,
              output_transformers,
              transformers,
              n_features,
              metric,
              self.model_class,
@@ -210,7 +204,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
              train_dataset,
              valid_dataset,
              valid_dataset, ['task_placeholder'] * n_tasks,
              output_transformers,
              transformers,
              n_features,
              metric,
              self.model_class,
@@ -221,7 +215,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
        model = self.model_class(hyper_parameters, model_dir)
        model.fit(train_dataset, **hyper_parameters)
        model.save()
        evaluator = Evaluator(model, valid_dataset, output_transformers)
        evaluator = Evaluator(model, valid_dataset, transformers)
        multitask_scores = evaluator.compute_model_performance([metric])
        score = multitask_scores[metric.name]

@@ -230,7 +224,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
        f.write(str(score))
        f.write('\n')
      # GPGO maximize performance by default, set performance to its negative value for minimization
      if direction:
      if use_max:
        return score
      else:
        return -score
@@ -274,7 +268,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
            train_dataset,
            valid_dataset,
            valid_dataset, ['task_placeholder'] * n_tasks,
            output_transformers,
            transformers,
            n_features,
            metric,
            self.model_class,
@@ -284,7 +278,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
            train_dataset,
            valid_dataset,
            valid_dataset, ['task_placeholder'] * n_tasks,
            output_transformers,
            transformers,
            n_features,
            metric,
            self.model_class,
@@ -294,7 +288,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
        # Record performances
        f.write(str(score))
        f.write('\n')
      if not direction:
      if not use_max:
        score = -score
      if score > valid_performance_opt:
        # Optimized model is better, return hyperparameters
+4 −3
Original line number Diff line number Diff line
@@ -39,13 +39,14 @@ class GridHyperparamOpt(HyperparamOpt):

    Parameters
    ----------
    params_dict: dict
      dict including parameters and their initial values.
    params_dict: Dict[str, list]
      Maps hyperparameter names (strings) to lists of possible
      parameter values.
    train_dataset: `dc.data.Dataset`
      dataset used for training
    valid_dataset: `dc.data.Dataset`
      dataset used for validation(optimization on valid scores)
    output_transformers: list of dc.trans.Transformer
    output_transformers: list[dc.trans.Transformer]
      transformers for evaluation
    metric: dc.metrics.Metric
      metric used for evaluation
+7 −11
Original line number Diff line number Diff line
@@ -23,20 +23,16 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
    valid_dataset = dc.data.NumpyDataset(
        X=np.random.rand(20, 5), y=np.random.rand(20, 1))
    optimizer = dc.hyper.GaussianProcessHyperparamOpt(rf_model_builder)
    params_dict = {"n_estimators": 40}
    params_dict = {"n_estimators": [10, 100]}
    transformers = [
        dc.trans.NormalizationTransformer(
            transform_y=True, dataset=train_dataset)
    ]
    metric = dc.metrics.Metric(dc.metrics.r2_score)

    best_hyperparams, all_results = optimizer.hyperparam_search(
        params_dict, train_dataset, valid_dataset, transformers, metric)

    ########################################
    print("best_hyperparams")
    print(best_hyperparams)
    print("all_results")
    print(all_results)
    assert 0 == 1
    ########################################
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
        params_dict,
        train_dataset,
        valid_dataset,
        transformers,
        metric)
+1 −2
Original line number Diff line number Diff line
"""
Tests for Grid hyperparam optimization.
Tests for hyperparam optimization.
"""
import os
import unittest
@@ -13,7 +13,6 @@ from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor



class TestGridHyperparamOpt(unittest.TestCase):
  """
  Test grid hyperparameter optimization API.