Commit 5314446e authored by Bharath Ramsundar

Changes

parent 20b0a55b
+17 −28

-def compute_parameter_search_space(params_dict, search_range):
+def compute_parameter_range(params_dict, search_range):
  """Convenience Function to compute parameter search space.

  Parameters
@@ -7,7 +7,7 @@ def compute_parameter_search_space(params_dict, search_range):
  params_dict: dict
    Dictionary mapping strings to Ints/Floats/Lists. For those
    parameters in which int/float is specified, an explicit list of
-    parameters is computed with `search_range`. Parameters in `hp_invalid_list`
+    parameters is computed with `search_range`.
  search_range: int(float) (default 4)
    For int/float values in `params_dict`, computes optimization range
    on `[initial values / search_range, initial values *
@@ -19,14 +19,9 @@ def compute_parameter_search_space(params_dict, search_range):
    Expanded dictionary of parameters where all int/float values in
    `params_dict` are expanded out into explicit search ranges.
  """
-  hyper_parameters = params_dict
-  hp_list = list(hyper_parameters.keys())
-
-  for hp in hp_invalid_list:
-    if hp in hp_list:
-      hp_list.remove(hp)
+  hp_list = list(params_dict.keys())

-  hp_list_class = [hyper_parameters[hp].__class__ for hp in hp_list]
+  hp_list_class = [params_dict[hp].__class__ for hp in hp_list]
  # Check the type is correct
  if not (set(hp_list_class) <= set([list, int, float])):
    raise ValueError("params_dict must contain values that are lists/ints/floats.")
@@ -37,34 +32,35 @@ def compute_parameter_search_space(params_dict, search_range):
  ]

  # List of float or int hyper parameters(ex. layer_sizes)
-  hp_list_multiple = [(hp_list[i], len(hyper_parameters[hp_list[i]]))
+  hp_list_multiple = [(hp_list[i], len(params_dict[hp_list[i]]))
                      for i in range(len(hp_list))
                      if hp_list_class[i] is list]

  # Range of optimization
  param_range = []
  for hp in hp_list_single:
-    if hyper_parameters[hp].__class__ is int:
+    if params_dict[hp].__class__ is int:
      param_range.append((('int'), [
-          hyper_parameters[hp] // search_range,
-          hyper_parameters[hp] * search_range
+          params_dict[hp] // search_range,
+          params_dict[hp] * search_range
      ]))
    else:
      param_range.append((('cont'), [
-          hyper_parameters[hp] / search_range,
-          hyper_parameters[hp] * search_range
+          params_dict[hp] / search_range,
+          params_dict[hp] * search_range
      ]))
  for hp in hp_list_multiple:
-    if hyper_parameters[hp[0]][0].__class__ is int:
+    if params_dict[hp[0]][0].__class__ is int:
      param_range.extend([(('int'), [
-          hyper_parameters[hp[0]][i] // search_range,
-          hyper_parameters[hp[0]][i] * search_range
+          params_dict[hp[0]][i] // search_range,
+          params_dict[hp[0]][i] * search_range
      ]) for i in range(hp[1])])
    else:
      param_range.extend([(('cont'), [
-          hyper_parameters[hp[0]][i] / search_range,
-          hyper_parameters[hp[0]][i] * search_range
+          params_dict[hp[0]][i] / search_range,
+          params_dict[hp[0]][i] * search_range
      ]) for i in range(hp[1])])
  return hp_list_single, hp_list_multiple, param_range

class HyperparamOpt(object):
  """Abstract superclass for hyperparameter search classes.
@@ -81,14 +77,9 @@ class HyperparamOpt(object):
  strategy for searching the hyperparameter evaluation space. This
  class itself is an abstract superclass and should never be directly
  instantiated.

-  Objects of this class maintains a list of constants,
-  `hp_invalid_list` that contains a list of model parameters which
-  cannot be optimized over This list is used to catch user errors. You
-  can customize this list in the constructor.
  """

-  def __init__(self, model_class, hp_invalid_list=['seed', 'nb_epoch', 'penalty_type', 'dropouts', 'bypass_dropouts', 'n_pair_feat', 'fit_transformers', 'min_child_weight', 'max_delta_step', 'subsample', 'colsample_bylevel', 'colsample_bytree', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'base_score']):
+  def __init__(self, model_class):
    """Initialize Hyperparameter Optimizer.

    Note this is an abstract constructor which should only be used by
@@ -112,14 +103,12 @@ class HyperparamOpt(object):
      must accept two arguments, `model_params` of type `dict` and
      `model_dir`, a string specifying a path to a model directory.
      See the example.
-    hp_invalid_list: list, (default `['seed', 'nb_epoch', 'penalty_type', 'dropouts', 'bypass_dropouts', 'n_pair_feat', 'fit_transformers', 'min_child_weight', 'max_delta_step', 'subsample', 'colsample_bylevel', 'colsample_bytree', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'base_score']`)
    """
    if self.__class__.__name__ == "HyperparamOpt":
      raise ValueError(
          "HyperparamOpt is an abstract superclass and cannot be directly instantiated. You probably want to instantiate a concrete subclass instead."
      )
    self.model_class = model_class
-    self.hp_invalid_list = hp_invalid_list

  def hyperparam_search(self,
                        params_dict,
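
For reference, a minimal usage sketch of the renamed `compute_parameter_range` helper, following the expansion logic in the diff above. The `params_dict` values here are hypothetical: ints expand with floor division, floats with true division, and list-valued parameters expand elementwise.

from deepchem.hyper.base_classes import compute_parameter_range

# Hypothetical parameter dictionary, chosen for illustration only.
params_dict = {"batch_size": 64, "learning_rate": 1e-3, "layer_sizes": [512, 512]}
hp_single, hp_multiple, param_range = compute_parameter_range(params_dict, search_range=4)
# hp_single   == ["batch_size", "learning_rate"]
# hp_multiple == [("layer_sizes", 2)]
# param_range == [("int", [16, 256]),           # 64 // 4, 64 * 4
#                 ("cont", [0.00025, 0.004]),   # 1e-3 / 4, 1e-3 * 4
#                 ("int", [128, 2048]),         # one entry per element
#                 ("int", [128, 2048])]         # of layer_sizes
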
+49 −119
@@ -6,9 +6,9 @@ import numpy as np
import tempfile
import os
import deepchem
+from deepchem.hyper.base_classes import compute_parameter_range
from deepchem.hyper.base_classes import HyperparamOpt
from deepchem.utils.evaluate import Evaluator
-from deepchem.molnet.run_benchmark_models import benchmark_classification, benchmark_regression

logger = logging.getLogger(__name__)

@@ -34,11 +34,9 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
      metric,
      use_max=True,
      logdir=None,
-      n_features=1024,
-      n_tasks=1,
      max_iter=20,
      search_range=4,
-      log_file='GPhypersearch.log'):
+      logfile=None):
    """Perform hyperparameter search using a gaussian process.

    Parameters
@@ -58,19 +56,17 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
    logdir: str, optional
      The directory in which to store created models. If not set, will
      use a temporary directory.
-    n_features: int, (default 1024)
-      number of input features
-    n_tasks: int, (default 1)
-      number of tasks
    max_iter: int, (default 20)
      number of optimization trials
    search_range: int(float) (default 4)
      optimization on [initial values / search_range,
                       initial values * search_range]
-      names of parameters that should not be optimized
-    logfile: string
-      name of log file, hyperparameters and results for each trial
-      will be recorded
+    logfile: str
+      Name of logfile to write results to. If specified, this must
+      be a valid file. If not specified, results of hyperparameter
+      search will be written to `logdir/.txt`.


    Returns
    -------
@@ -82,54 +78,27 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
    """
+    if len(params_dict) > 20:
+      raise ValueError("This class can only search over 20 parameters in one invocation.")
-    #hyper_parameters = params_dict
-    #hp_list = list(hyper_parameters.keys())
-    #hp_list_class = [hyper_parameters[hp].__class__ for hp in hp_list]
-    #assert set(hp_list_class) <= set([list, int, float])
-    ## Float or int hyper parameters(ex. batch_size, learning_rate)
-    #hp_list_single = [
-    #    hp_list[i] for i in range(len(hp_list)) if not hp_list_class[i] is list
-    #]
-    ## List of float or int hyper parameters(ex. layer_sizes)
-    #hp_list_multiple = [(hp_list[i], len(hyper_parameters[hp_list[i]]))
-    #                    for i in range(len(hp_list))
-    #                    if hp_list_class[i] is list]
+    data_dir = deepchem.utils.get_data_dir()
+    # Specify logfile
+    if logfile:
+      log_file = logfile
+    elif logdir is not None:
+      log_file = os.path.join(model_dir, log_file)
+    else:
+      log_file = None

-    hyper_parameters = params_dict
+    hp_list_single, hp_list_multiple, param_range = compute_parameter_range(params_dict, search_range)

    # Number of parameters
    n_param = len(hp_list_single)
    if len(hp_list_multiple) > 0:
      n_param = n_param + sum([hp[1] for hp in hp_list_multiple])
-    ## Range of optimization
-    #param_range = []
-    #for hp in hp_list_single:
-    #  if hyper_parameters[hp].__class__ is int:
-    #    param_range.append((('int'), [
-    #        hyper_parameters[hp] // search_range,
-    #        hyper_parameters[hp] * search_range
-    #    ]))
-    #  else:
-    #    param_range.append((('cont'), [
-    #        hyper_parameters[hp] / search_range,
-    #        hyper_parameters[hp] * search_range
-    #    ]))
-    #for hp in hp_list_multiple:
-    #  if hyper_parameters[hp[0]][0].__class__ is int:
-    #    param_range.extend([(('int'), [
-    #        hyper_parameters[hp[0]][i] // search_range,
-    #        hyper_parameters[hp[0]][i] * search_range
-    #    ]) for i in range(hp[1])])
-    #  else:
-    #    param_range.extend([(('cont'), [
-    #        hyper_parameters[hp[0]][i] / search_range,
-    #        hyper_parameters[hp[0]][i] * search_range
-    #    ]) for i in range(hp[1])])

    # Dummy names
    param_name = ['l' + format(i, '02d') for i in range(20)]
    param = dict(zip(param_name[:n_param], param_range))

-    data_dir = deepchem.utils.get_data_dir()
-    log_file = os.path.join(data_dir, log_file)

    def f(l00=0,
          l01=0,
@@ -183,33 +152,23 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
        i = i + hp[1]

      logger.info(hyper_parameters)
-      # Run benchmark
+      if log_file:
        with open(log_file, 'a') as f:
          # Record hyperparameters
          f.write(str(hyper_parameters))
          f.write('\n')
-      if isinstance(self.model_class, str):
+      if logdir is not None:
+        model_dir = os.path.join(logdir, str(ind))
+        logger.info("model_dir is %s" % model_dir)
        try:
-          train_scores, valid_scores, _ = benchmark_classification(
-              train_dataset,
-              valid_dataset,
-              valid_dataset, ['task_placeholder'] * n_tasks,
-              transformers,
-              n_features,
-              metric,
-              self.model_class,
-              hyper_parameters=hyper_parameters)
-        except AssertionError:
-          train_scores, valid_scores, _ = benchmark_regression(
-              train_dataset,
-              valid_dataset,
-              valid_dataset, ['task_placeholder'] * n_tasks,
-              transformers,
-              n_features,
-              metric,
-              self.model_class,
-              hyper_parameters=hyper_parameters)
-        score = valid_scores[self.model_class][metric[0].name]
+          os.makedirs(model_dir)
+        except OSError:
+          if not os.path.isdir(model_dir):
+            logger.info("Error creating model_dir, using tempfile directory")
+            model_dir = tempfile.mkdtemp()
+      else:
+        model_dir = tempfile.mkdtemp()
+      model = self.model_class(hyper_parameters, model_dir)
@@ -219,6 +178,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
      multitask_scores = evaluator.compute_model_performance([metric])
      score = multitask_scores[metric.name]

+      if log_file:
        with open(log_file, 'a') as f:
          # Record performances
          f.write(str(score))
@@ -258,41 +218,11 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
      i = i + hp[1]

    # Compare best model to default hyperparameters
+    if log_file:
      with open(log_file, 'a') as f:
        # Record hyperparameters
        f.write(str(params_dict))
        f.write('\n')
-    if isinstance(self.model_class, str):
-      try:
-        train_scores, valid_scores, _ = benchmark_classification(
-            train_dataset,
-            valid_dataset,
-            valid_dataset, ['task_placeholder'] * n_tasks,
-            transformers,
-            n_features,
-            metric,
-            self.model_class,
-            hyper_parameters=params_dict)
-      except AssertionError:
-        train_scores, valid_scores, _ = benchmark_regression(
-            train_dataset,
-            valid_dataset,
-            valid_dataset, ['task_placeholder'] * n_tasks,
-            transformers,
-            n_features,
-            metric,
-            self.model_class,
-            hyper_parameters=params_dict)
-      score = valid_scores[self.model_class][metric[0].name]
-      with open(log_file, 'a') as f:
-        # Record performances
-        f.write(str(score))
-        f.write('\n')
-      if not use_max:
-        score = -score
-      if score > valid_performance_opt:
-        # Optimized model is better, return hyperparameters
-        return params_dict, score

    # Return default hyperparameters
    return hyper_parameters, valid_performance_opt
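
With the `benchmark_classification`/`benchmark_regression` paths removed, `GaussianProcessHyperparamOpt` now builds models directly via the positional call `self.model_class(hyper_parameters, model_dir)`. A hedged sketch of a compatible builder follows; the builder name and the RandomForest choice are illustrative, not part of this commit.

import deepchem as dc
from sklearn.ensemble import RandomForestRegressor

def rf_model_builder(model_params, model_dir):
  # Invoked positionally by the optimizer as
  # self.model_class(hyper_parameters, model_dir).
  sklearn_model = RandomForestRegressor(**model_params)
  return dc.models.SklearnModel(sklearn_model, model_dir)

optimizer = dc.hyper.GaussianProcessHyperparamOpt(rf_model_builder)
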
+3 −2
@@ -13,6 +13,7 @@ from operator import mul
from deepchem.utils.evaluate import Evaluator
from deepchem.hyper.base_classes import HyperparamOpt

+logger = logging.getLogger(__name__)

class GridHyperparamOpt(HyperparamOpt):
  """
@@ -98,8 +99,8 @@ class GridHyperparamOpt(HyperparamOpt):
            model_dir = tempfile.mkdtemp()
      else:
        model_dir = tempfile.mkdtemp()

-      model = self.model_class(model_params, model_dir)
+      model_params['model_dir'] = model_dir
+      model = self.model_class(**model_params)
      model.fit(train_dataset)

      evaluator = Evaluator(model, valid_dataset, output_transformers)
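
`GridHyperparamOpt` now injects `model_dir` into `model_params` and calls the builder with keyword arguments only, so builders must accept `**model_params` and pull `model_dir` back out, as the updated tests below do. A minimal sketch of the convention, with an illustrative builder name:

import deepchem as dc
from sklearn.ensemble import RandomForestRegressor

def rf_model_builder(**model_params):
  # `model_dir` arrives inside model_params and must be separated
  # from the kwargs forwarded to the sklearn estimator.
  model_dir = model_params.pop('model_dir')
  sklearn_model = RandomForestRegressor(**model_params)
  return dc.models.SklearnModel(sklearn_model, model_dir)
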
+1 −1
@@ -23,7 +23,7 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
    valid_dataset = dc.data.NumpyDataset(
        X=np.random.rand(20, 5), y=np.random.rand(20, 1))
    optimizer = dc.hyper.GaussianProcessHyperparamOpt(rf_model_builder)
    params_dict = {"n_estimators": [10, 100]}
    params_dict = {"n_estimators": 10}
    transformers = [
        dc.trans.NormalizationTransformer(
            transform_y=True, dataset=train_dataset)
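
The baseline here changes from a list to a scalar because `compute_parameter_range` expands an int value into an explicit `[value // search_range, value * search_range]` search interval, whereas a list would instead be expanded elementwise.
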
+12 −7
@@ -43,8 +43,10 @@ class TestGridHyperparamOpt(unittest.TestCase):
    params_dict = {"n_estimators": [10, 100]}
    metric = dc.metrics.Metric(dc.metrics.r2_score)

-    def rf_model_builder(model_params, model_dir):
-      sklearn_model = RandomForestRegressor(**model_params)
+    def rf_model_builder(**model_params):
+      rf_params = {k:v for (k,v) in model_params.items() if k != 'model_dir'}
+      model_dir = model_params['model_dir']
+      sklearn_model = RandomForestRegressor(**rf_params)
      return dc.models.SklearnModel(sklearn_model, model_dir)

    optimizer = dc.hyper.GridHyperparamOpt(rf_model_builder)
@@ -91,10 +93,11 @@ class TestGridHyperparamOpt(unittest.TestCase):
        dc.metrics.matthews_corrcoef, np.mean, mode="classification")
    params_dict = {"n_estimators": [1, 10]}

-    def multitask_model_builder(model_params, model_dir):
+    def multitask_model_builder(**model_params):
+      rf_params = {k:v for (k,v) in model_params.items() if k != 'model_dir'}
+      model_dir = model_params['model_dir']
      def model_builder(model_dir):
-        sklearn_model = RandomForestClassifier(**model_params)
+        sklearn_model = RandomForestClassifier(**rf_params)
        return dc.models.SklearnModel(sklearn_model, model_dir)

      return dc.models.SingletaskToMultitask(tasks, model_builder, model_dir)
@@ -137,9 +140,11 @@ class TestGridHyperparamOpt(unittest.TestCase):
        dc.metrics.roc_auc_score, np.mean, mode="classification")
    params_dict = {"layer_sizes": [(10,), (100,)]}

-    def model_builder(model_params, model_dir):
+    def model_builder(**model_params):
+      model_dir = model_params['model_dir']
+      multitask_params = {k:v for (k,v) in model_params.items() if k != 'model_dir'}
      return dc.models.MultitaskClassifier(
-          len(tasks), n_features, model_dir=model_dir, **model_params)
+          len(tasks), n_features, model_dir=model_dir, **multitask_params)

    optimizer = dc.hyper.GridHyperparamOpt(model_builder)
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(