Commit 2c23fc6d authored by Bharath Ramsundar

Changes

parent 255af47f
deepchem/hyper/__init__.py  +2 −1
-from deepchem.hyper.grid_search import HyperparamOpt
+from deepchem.hyper.base_classes import HyperparamOpt
+from deepchem.hyper.grid_search import GridHyperparamOpt
 from deepchem.hyper.gaussian_process import GaussianProcessHyperparamOpt
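After this change, all three optimizer entry points resolve from `deepchem.hyper`. A quick sanity check (a minimal sketch, assuming DeepChem is installed):

```python
import deepchem as dc

# The three names exported by the reworked deepchem/hyper/__init__.py.
print(dc.hyper.HyperparamOpt)
print(dc.hyper.GridHyperparamOpt)
print(dc.hyper.GaussianProcessHyperparamOpt)
```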
deepchem/hyper/base_classes.py  +47 −0
class HyperparamOpt(object):
  """Abstract superclass for hyperparameter search classes.

  This class is an abstract base class for hyperparameter search
  classes in DeepChem. Hyperparameter search is performed on
  `dc.models.Model` classes. Each hyperparameter search object accepts
  a `dc.models.Model` constructor function upon construction. When
  `hyperparam_search` is invoked, this constructor is used to build
  many different concrete models, which are trained on the specified
  training set and evaluated on a given validation set.

  Different subclasses of `HyperparamOpt` differ in the choice of
  strategy for searching the hyperparameter evaluation space. This
  class itself is an abstract superclass and should never be directly
  instantiated.
  """

  def __init__(self, model_class):
    """Initialize Hyperparameter Optimizer.

    Note this is an abstract constructor which should only be used by
    subclasses.

    Example
    -------
    This example shows the type of constructor function expected.

    >>> import sklearn.ensemble
    >>> import deepchem as dc
    >>> def rf_model_builder(model_params, model_dir):
    ...   sklearn_model = sklearn.ensemble.RandomForestRegressor(**model_params)
    ...   return dc.models.SklearnModel(sklearn_model, model_dir)

    Parameters
    ----------
    model_class: constructor function
      This parameter must be a constructor function which returns an
      object that is an instance of `dc.models.Model`. The function
      must accept two arguments: `model_params` of type `dict` and
      `model_dir`, a string specifying a path to a model directory.
      See the example above.
    """
    if self.__class__.__name__ == "HyperparamOpt":
      raise ValueError(
          "HyperparamOpt is an abstract superclass and cannot be directly instantiated. You probably want to instantiate a concrete subclass instead."
      )
    self.model_class = model_class
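Since `HyperparamOpt` now guards against direct instantiation, callers always go through a concrete subclass. A minimal sketch of both behaviors, assuming the `dc.hyper` exports from this commit's `__init__.py`:

```python
import sklearn.ensemble
import deepchem as dc

def rf_model_builder(model_params, model_dir):
  # Constructor function with the interface described in the docstring above.
  sklearn_model = sklearn.ensemble.RandomForestRegressor(**model_params)
  return dc.models.SklearnModel(sklearn_model, model_dir)

# Instantiating the abstract superclass raises ValueError...
try:
  dc.hyper.HyperparamOpt(rf_model_builder)
except ValueError as e:
  print(e)

# ...while concrete subclasses accept the constructor function.
optimizer = dc.hyper.GridHyperparamOpt(rf_model_builder)
```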
deepchem/hyper/gaussian_process.py  +25 −28
@@ -5,7 +5,8 @@ import logging
import numpy as np
import tempfile
import os
-from deepchem.hyper.grid_search import HyperparamOpt
+import deepchem
+from deepchem.hyper.base_classes import HyperparamOpt
from deepchem.utils.evaluate import Evaluator
from deepchem.molnet.run_benchmark_models import benchmark_classification, benchmark_regression

@@ -39,10 +40,10 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
      log_file='GPhypersearch.log'):
    """Perform hyperparams search using a gaussian process assumption

-    params_dict include single-valued parameters being optimized,
-    which should only contain int, float and list of int(float)
-
-    parameters with names in hp_invalid_list will not be changed.
+    `params_dict` should map names of parameters being optimized to a
+    list of parameter values, which should only contain int, float and
+    list of int(float). Parameters with names in hp_invalid_list will
+    not be changed.

    For MolNet models, `self.model_class` is the model name as a string,
    and `params_dict = dc.molnet.preset_hyper_parameters.hps[self.model_class]`.
@@ -52,29 +53,30 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
    params_dict: dict
      dict including parameters and their initial values; parameters
      not suitable for optimization can be added to `hp_invalid_list`
-    train_dataset: dc.data.Dataset struct
+    train_dataset: `dc.data.Dataset`
      dataset used for training
-    valid_dataset: dc.data.Dataset struct
+    valid_dataset: `dc.data.Dataset`
      dataset used for validation (optimization on valid scores)
-    output_transformers: list of dc.trans.Transformer
+    output_transformers: list[dc.trans.Transformer]
      transformers for evaluation
-    metric: list of dc.metrics.Metric
+    metric: `dc.metrics.Metric`
      metric used for evaluation
-    direction: bool
+    direction: bool (default True)
      maximization (True) or minimization (False)
-    n_features: int
+    n_features: int (default 1024)
      number of input features
-    n_tasks: int
+    n_tasks: int (default 1)
      number of tasks
-    max_iter: int
+    max_iter: int (default 20)
      number of optimization trials
-    search_range: int(float)
+    search_range: int(float) (default 4)
      optimization on [initial values / search_range,
                       initial values * search_range]
-    hp_invalid_list: list
+    hp_invalid_list: list (default `['seed', 'nb_epoch', 'penalty_type', 'dropouts', 'bypass_dropouts', 'n_pair_feat', 'fit_transformers', 'min_child_weight', 'max_delta_step', 'subsample', 'colsample_bylevel', 'colsample_bytree', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'base_score']`)
      names of parameters that should not be optimized
    log_file: str
-      name of log file, hyperparameters and results for each trial will be recorded
+      name of log file; hyperparameters and results for each trial
+      will be recorded

    Returns
    -------
@@ -82,10 +84,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
      params_dict with all optimized values
    valid_performance_opt: float
      best performance on valid dataset

    """

    assert len(metric) == 1, 'Only use one metric'
    hyper_parameters = params_dict
    hp_list = list(hyper_parameters.keys())
    for hp in hp_invalid_list:
@@ -136,7 +135,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
    param_name = ['l' + format(i, '02d') for i in range(20)]
    param = dict(zip(param_name[:n_param], param_range))

-    data_dir = os.environ['DEEPCHEM_DATA_DIR']
+    data_dir = deepchem.utils.get_data_dir()
    log_file = os.path.join(data_dir, log_file)

    def f(l00=0,
@@ -186,7 +185,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
            float(args[param_name[j]]) for j in range(i, i + hp[1])
        ]
        if param_range[i][0] == 'int':
-          hyper_parameters[hp[0]] = map(int, hyper_parameters[hp[0]])
+          hyper_parameters[hp[0]] = list(map(int, hyper_parameters[hp[0]]))
        i = i + hp[1]

      logger.info(hyper_parameters)
@@ -195,8 +194,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
        # Record hyperparameters
        f.write(str(hyper_parameters))
        f.write('\n')
-      if isinstance(self.model_class, str) or isinstance(
-          self.model_class, unicode):
+      if isinstance(self.model_class, str):
        try:
          train_scores, valid_scores, _ = benchmark_classification(
              train_dataset,
@@ -224,8 +222,8 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
        model.fit(train_dataset, **hyper_parameters)
        model.save()
        evaluator = Evaluator(model, valid_dataset, output_transformers)
-        multitask_scores = evaluator.compute_model_performance(metric)
-        score = multitask_scores[metric[0].name]
+        multitask_scores = evaluator.compute_model_performance([metric])
+        score = multitask_scores[metric.name]

      with open(log_file, 'a') as f:
        # Record performances
@@ -262,7 +260,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
          float(hp_opt[param_name[j]]) for j in range(i, i + hp[1])
      ]
      if param_range[i][0] == 'int':
-        hyper_parameters[hp[0]] = map(int, hyper_parameters[hp[0]])
+        hyper_parameters[hp[0]] = list(map(int, hyper_parameters[hp[0]]))
      i = i + hp[1]

    # Compare best model to default hyperparameters
@@ -270,8 +268,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
      # Record hyperparameters
      f.write(str(params_dict))
      f.write('\n')
-    if isinstance(self.model_class, str) or isinstance(self.model_class,
-                                                       unicode):
+    if isinstance(self.model_class, str):
      try:
        train_scores, valid_scores, _ = benchmark_classification(
            train_dataset,
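One recurring fix above is Python 3 hygiene: `map` now returns a lazy iterator rather than a list, which is why the diff wraps it in `list(...)` before the values are indexed or reused. A minimal illustration:

```python
# Python 2's map() returned a list; Python 3's returns a one-shot iterator.
vals = map(int, ["1", "2", "3"])
# vals[0] would raise TypeError: 'map' object is not subscriptable
vals = list(map(int, ["1", "2", "3"]))
print(vals[0])  # 1
```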
deepchem/hyper/grid_search.py  +48 −29
-#!/usr/bin/env python2
-# -*- coding: utf-8 -*-
"""
Contains basic hyperparameter optimizations.
"""
@@ -9,23 +7,23 @@ import itertools
import tempfile
import shutil
import collections
import logging
from functools import reduce
from operator import mul
from deepchem.utils.evaluate import Evaluator
-from deepchem.utils.save import log
+from deepchem.hyper.base_classes import HyperparamOpt


-class HyperparamOpt(object):
-  """
-  Provides simple hyperparameter search capabilities.
-  """
-
-  def __init__(self, model_class, verbose=True):
-    self.model_class = model_class
-    self.verbose = verbose
+class GridHyperparamOpt(HyperparamOpt):
+  """
+  Provides simple grid hyperparameter search capabilities.
+
+  This class performs a grid hyperparameter search over the specified
+  hyperparameter space. The implementation simply iterates directly
+  over all combinations of hyperparameters and doesn't use
+  parallelization to speed up the search.
+  """

  # TODO(rbharath): This function is complicated and monolithic. Is there a nice
  # way to refactor this?
  def hyperparam_search(self,
                        params_dict,
                        train_dataset,
@@ -36,10 +34,35 @@ class HyperparamOpt(object):
                        logdir=None):
    """Perform hyperparams search according to params_dict.

-    Each key to hyperparams_dict is a model_param. The values should be a list
-    of potential values for that hyperparam.
+    Each key to params_dict is a model_param. The values should
+    be a list of potential values for that hyperparam.

+    Parameters
+    ----------
+    params_dict: dict
+      dict including parameters and their initial values.
+    train_dataset: `dc.data.Dataset`
+      dataset used for training
+    valid_dataset: `dc.data.Dataset`
+      dataset used for validation (optimization on valid scores)
+    output_transformers: list of dc.trans.Transformer
+      transformers for evaluation
+    metric: dc.metrics.Metric
+      metric used for evaluation
+    use_max: bool, optional
+      If True, return the model with the highest score. Else return
+      the model with the minimum score.
+    logdir: str, optional
+      The directory in which to store created models. If not set, a
+      temporary directory will be used.

-    TODO(rbharath): This shouldn't be stored in a temporary directory.
+    Returns
+    -------
+    `(best_model, best_hyperparams, all_scores)` where `best_model` is
+    an instance of `dc.models.Model`, `best_hyperparams` is a
+    dictionary of parameters, and `all_scores` is a dictionary mapping
+    string representations of hyperparameter sets to validation
+    scores.
    """
    hyperparams = params_dict.keys()
    hyperparam_vals = params_dict.values()
@@ -58,20 +81,19 @@ class HyperparamOpt(object):
    for ind, hyperparameter_tuple in enumerate(
        itertools.product(*hyperparam_vals)):
      model_params = {}
      log("Fitting model %d/%d" % (ind + 1, number_combinations), self.verbose)
      logger.info("Fitting model %d/%d" % (ind + 1, number_combinations))
      for hyperparam, hyperparam_val in zip(hyperparams, hyperparameter_tuple):
        model_params[hyperparam] = hyperparam_val
      log("hyperparameters: %s" % str(model_params), self.verbose)
      logger.info("hyperparameters: %s" % str(model_params))

      if logdir is not None:
        model_dir = os.path.join(logdir, str(ind))
        log("model_dir is %s" % model_dir, self.verbose)
        logger.info("model_dir is %s" % model_dir)
        try:
          os.makedirs(model_dir)
        except OSError:
          if not os.path.isdir(model_dir):
            log("Error creating model_dir, using tempfile directory",
                self.verbose)
            logger.info("Error creating model_dir, using tempfile directory")
            model_dir = tempfile.mkdtemp()
      else:
        model_dir = tempfile.mkdtemp()
@@ -95,21 +117,18 @@ class HyperparamOpt(object):
      else:
        shutil.rmtree(model_dir)

-      log(
-          "Model %d/%d, Metric %s, Validation set %s: %f" %
-          (ind + 1, number_combinations, metric.name, ind, valid_score),
-          self.verbose)
-      log("\tbest_validation_score so far: %f" % best_validation_score,
-          self.verbose)
+      logger.info("Model %d/%d, Metric %s, Validation set %s: %f" %
+                  (ind + 1, number_combinations, metric.name, ind, valid_score))
+      logger.info("\tbest_validation_score so far: %f" % best_validation_score)
    if best_model is None:
      log("No models trained correctly.", self.verbose)
      logger.info("No models trained correctly.")
      # arbitrarily return last model
      best_model, best_hyperparams = model, hyperparameter_tuple
      return best_model, best_hyperparams, all_scores
    train_evaluator = Evaluator(best_model, train_dataset, output_transformers)
    multitask_scores = train_evaluator.compute_model_performance([metric])
    train_score = multitask_scores[metric.name]
    log("Best hyperparameters: %s" % str(best_hyperparams), self.verbose)
    log("train_score: %f" % train_score, self.verbose)
    log("validation_score: %f" % best_validation_score, self.verbose)
    logger.info("Best hyperparameters: %s" % str(best_hyperparams))
    logger.info("train_score: %f" % train_score)
    logger.info("validation_score: %f" % best_validation_score)
    return best_model, best_hyperparams, all_scores
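For reference, a minimal sketch of driving `GridHyperparamOpt` end to end, assuming small random in-memory datasets; the hyperparameter names and values here are illustrative:

```python
import numpy as np
import sklearn.ensemble
import deepchem as dc

def rf_model_builder(model_params, model_dir):
  sklearn_model = sklearn.ensemble.RandomForestRegressor(**model_params)
  return dc.models.SklearnModel(sklearn_model, model_dir)

# Tiny random datasets, just enough to exercise the search loop.
train = dc.data.NumpyDataset(X=np.random.rand(50, 5), y=np.random.rand(50, 1))
valid = dc.data.NumpyDataset(X=np.random.rand(20, 5), y=np.random.rand(20, 1))
metric = dc.metrics.Metric(dc.metrics.r2_score)

optimizer = dc.hyper.GridHyperparamOpt(rf_model_builder)
# 2 x 2 = 4 hyperparameter combinations, evaluated sequentially.
params = {"n_estimators": [10, 100], "max_depth": [4, 16]}
best_model, best_params, all_scores = optimizer.hyperparam_search(
    params, train, valid, [], metric)
```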
+42 −0 (new test file)
"""
Tests for Gaussian Process Hyperparameter Optimization.
"""
import numpy as np
import sklearn.ensemble
import deepchem as dc
import unittest


class TestGaussianHyperparamOpt(unittest.TestCase):
  """
  Test Gaussian Hyperparameter Optimization.
  """

  def test_rf_example(self):

    def rf_model_builder(model_params, model_dir):
      sklearn_model = sklearn.ensemble.RandomForestRegressor(**model_params)
      return dc.models.SklearnModel(sklearn_model, model_dir)

    train_dataset = dc.data.NumpyDataset(
        X=np.random.rand(50, 5), y=np.random.rand(50, 1))
    valid_dataset = dc.data.NumpyDataset(
        X=np.random.rand(20, 5), y=np.random.rand(20, 1))
    optimizer = dc.hyper.GaussianProcessHyperparamOpt(rf_model_builder)
    params_dict = {"n_estimators": 40}
    transformers = [
        dc.trans.NormalizationTransformer(
            transform_y=True, dataset=train_dataset)
    ]
    metric = dc.metrics.Metric(dc.metrics.r2_score)

    best_hyperparams, all_results = optimizer.hyperparam_search(
        params_dict, train_dataset, valid_dataset, transformers, metric)

    # Sanity-check the returned values: per the optimizer's docstring,
    # hyperparam_search returns the optimized hyperparameter dict and the
    # best validation performance.
    assert isinstance(best_hyperparams, dict)
    assert "n_estimators" in best_hyperparams
    assert isinstance(all_results, float)