Commit 2ef0e4b7 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

changes

parent ef070999
Loading
Loading
Loading
Loading
+4 −14
Original line number Diff line number Diff line
@@ -15,27 +15,17 @@ class HyperparamOpt(object):
  instantiated.
  """

  def __init__(self, model_class):
  def __init__(self, model_builder):
    """Initialize Hyperparameter Optimizer.

    Note this is an abstract constructor which should only be used by
    subclasses.

    Example
    -------
    This example shows the type of constructor function expected. 

    >>> import sklearn
    >>> import deepchem as dc
    >>> def rf_model_builder(model_params, model_dir):
    ...   sklearn_model = sklearn.ensemble.RandomForestRegressor(**model_params)
    ...   return dc.models.SklearnModel(sklearn_model, model_dir)

    Parameters
    ----------
    model_class: constructor function.
    model_builder: constructor function.
      This parameter must be constructor function which returns an
      object which is an instance of `dc.model.Model`. This function
      object which is an instance of `dc.models.Model`. This function
      must accept two arguments, `model_params` of type `dict` and
      `model_dir`, a string specifying a path to a model directory.
      See the example.
@@ -44,7 +34,7 @@ class HyperparamOpt(object):
      raise ValueError(
          "HyperparamOpt is an abstract superclass and cannot be directly instantiated. You probably want to instantiate a concrete subclass instead."
      )
    self.model_class = model_class
    self.model_builder = model_builder

  def hyperparam_search(self,
                        params_dict,
+32 −12
Original line number Diff line number Diff line
@@ -86,9 +86,19 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
  `GridHyperparamOpt`. `param_dict[hp]` must be an int/float and is
  used as the center of a search range.

  Note
  ----
  This class can only optimize 20 parameters at a time.
  Example
  -------
  This example shows the type of constructor function expected. 

  >>> import sklearn
  >>> import deepchem as dc
  >>> def rf_model_builder(**model_params):
  ...   rf_params = {k: v for (k, v) in model_params.items() if k != 'model_dir'}
  ...   model_dir = model_params['model_dir']
  ...   sklearn_model = sklearn.ensemble.RandomForestRegressor(**rf_params)
  ...   return dc.models.SklearnModel(sklearn_model, model_dir)
  >>> optimizer = dc.hyper.GaussianProcessHyperparamOpt(rf_model_builder)

  """

  def hyperparam_search(self,
@@ -149,7 +159,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
    if logfile:
      log_file = logfile
    elif logdir is not None:
      log_file = os.path.join(logdir, log_file)
      log_file = os.path.join(logdir, "results.txt")
    else:
      log_file = None

@@ -159,19 +169,20 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):

    # Stores all results
    all_results = {}
    # Stores all model locations
    model_locations = {}

    # Demarcating internal function for readability
    ########################
    def f(**placeholders):
    def optimizing_function(**placeholders):
      """Private Optimizing function

      Take in hyper parameter values and return valid set performances

      Parameters
      ----------
      l00~l19: int or float
        placeholders for hyperparameters being optimized,
        hyper_parameters dict is rebuilt based on input values of placeholders
      placeholders: keyword arguments
        Should be various hyperparameters as specified in `param_keys` above.

      Returns
      -------
@@ -209,7 +220,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
        model_dir = tempfile.mkdtemp()
      # Add it on to the information needed for the constructor
      hyper_parameters["model_dir"] = model_dir
      model = self.model_class(**hyper_parameters)
      model = self.model_builder(**hyper_parameters)
      model.fit(train_dataset)
      try:
        model.save()
@@ -228,6 +239,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
          f.write('\n')
      # Store all results
      all_results[hp_str] = score
      model_locations[hp_str] = model_dir
      # GPGO maximizes performance by default, so negate the score when minimizing
      if use_max:
        return score
@@ -244,7 +256,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
    cov = matern32()
    gp = GaussianProcess(cov)
    acq = Acquisition(mode='ExpectedImprovement')
    gpgo = GPGO(gp, acq, f, param_range)
    gpgo = GPGO(gp, acq, optimizing_function, param_range)
    logger.info("Max number of iteration: %i" % max_iter)
    gpgo.run(max_iter=max_iter)

@@ -256,9 +268,17 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
      else:
        hyper_parameters[hp] = float(hp_opt[hp])
    hp_str = _convert_hyperparam_dict_to_filename(hyper_parameters)
    model_dir = "model%s" % hp_str

    # Let's reinitialize the model with the best parameters
    model_dir = model_locations[hp_str]
    hyper_parameters["model_dir"] = model_dir
    best_model = self.model_class(**hyper_parameters)
    best_model = self.model_builder(**hyper_parameters)
    # Some models need to be explicitly reloaded
    try:
      best_model.restore()
    # Some models auto reload
    except NotImplementedError:
      pass

    # Compare best model to default hyperparameters
    if log_file:
+15 −1
Original line number Diff line number Diff line
@@ -24,6 +24,20 @@ class GridHyperparamOpt(HyperparamOpt):
  hyperparameter space. This implementation is simple and simply does
  a direct iteration over all possible hyperparameters and doesn't use
  parallelization to speed up the search.

  Example
  -------
  This example shows the type of constructor function expected. 

  >>> import sklearn
  >>> import deepchem as dc
  >>> def rf_model_builder(**model_params):
  ...   rf_params = {k: v for (k, v) in model_params.items() if k != 'model_dir'}
  ...   model_dir = model_params['model_dir']
  ...   sklearn_model = sklearn.ensemble.RandomForestRegressor(**rf_params)
  ...   return dc.models.SklearnModel(sklearn_model, model_dir)
  >>> optimizer = dc.hyper.GridHyperparamOpt(rf_model_builder)

  """

  def hyperparam_search(self,
@@ -101,7 +115,7 @@ class GridHyperparamOpt(HyperparamOpt):
      else:
        model_dir = tempfile.mkdtemp()
      model_params['model_dir'] = model_dir
      model = self.model_class(**model_params)
      model = self.model_builder(**model_params)
      model.fit(train_dataset)
      try:
        model.save()
+44 −6
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@ import numpy as np
import sklearn
import deepchem as dc
import unittest
import tempfile


class TestGaussianHyperparamOpt(unittest.TestCase):
@@ -12,7 +13,8 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
  Test Gaussian Hyperparameter Optimization.
  """

  def test_rf_example(self):
  def setUp(self):
    """Set up common resources."""

    def rf_model_builder(**model_params):
      rf_params = {k: v for (k, v) in model_params.items() if k != 'model_dir'}
@@ -20,17 +22,53 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
      sklearn_model = sklearn.ensemble.RandomForestRegressor(**rf_params)
      return dc.models.SklearnModel(sklearn_model, model_dir)

    train_dataset = dc.data.NumpyDataset(
    self.rf_model_builder = rf_model_builder
    self.train_dataset = dc.data.NumpyDataset(
        X=np.random.rand(50, 5), y=np.random.rand(50, 1))
    valid_dataset = dc.data.NumpyDataset(
    self.valid_dataset = dc.data.NumpyDataset(
        X=np.random.rand(20, 5), y=np.random.rand(20, 1))
    optimizer = dc.hyper.GaussianProcessHyperparamOpt(rf_model_builder)

  def test_rf_example(self):
    """Test a simple example of optimizing a RF model with a gaussian process."""

    optimizer = dc.hyper.GaussianProcessHyperparamOpt(self.rf_model_builder)
    params_dict = {"n_estimators": 10}
    transformers = [
        dc.trans.NormalizationTransformer(
            transform_y=True, dataset=train_dataset)
            transform_y=True, dataset=self.train_dataset)
    ]
    metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
        params_dict, train_dataset, valid_dataset, transformers, metric)
        params_dict,
        self.train_dataset,
        self.valid_dataset,
        transformers,
        metric,
        max_iter=2)

    valid_score = best_model.evaluate(self.valid_dataset, [metric],
                                      transformers)
    assert valid_score["pearson_r2_score"] > 0

  def test_rf_with_logdir(self):
    """Test that using a logdir can work correctly."""
    optimizer = dc.hyper.GaussianProcessHyperparamOpt(self.rf_model_builder)
    params_dict = {"n_estimators": 10}
    transformers = [
        dc.trans.NormalizationTransformer(
            transform_y=True, dataset=self.train_dataset)
    ]
    metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
    with tempfile.TemporaryDirectory() as tmpdirname:
      best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
          params_dict,
          self.train_dataset,
          self.valid_dataset,
          transformers,
          metric,
          logdir=tmpdirname,
          max_iter=2)
    valid_score = best_model.evaluate(self.valid_dataset, [metric],
                                      transformers)
    assert valid_score["pearson_r2_score"] > 0
+1 −1
Original line number Diff line number Diff line
@@ -77,7 +77,7 @@ class Model(BaseEstimator):
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")

  def reload(self):
  def restore(self):
    """
    Reload trained model from disk.
    """
Loading