Commit ecd815fd authored by Bharath Ramsundar

Changes

parent 5ecedefb
+32 −0
import logging

logger = logging.getLogger(__name__)


def _convert_hyperparam_dict_to_filename(hyper_params):
  """Helper function that converts a dictionary of hyperparameters to a string that can be a filename.

  Parameters
  ----------
  hyper_params: dict
    Maps string of hyperparameter name to int/float.

  Returns
  -------
  filename: str
    A filename of form "_key1_value1_key2_value2_..."
  """
  filename = ""
  keys = sorted(hyper_params.keys())
  for key in keys:
    filename += "_%s" % str(key)
    value = hyper_params[key]
    if isinstance(value, int):
      filename += "_%s" % str(value)
    elif isinstance(value, float):
      filename += "_%.2f" % value
    else:
      filename += "_%s" % str(value)
  return filename


class HyperparamOpt(object):
  """Abstract superclass for hyperparameter search classes.

+22 −34
@@ -8,39 +8,11 @@ import os
import deepchem
from deepchem.hyper.base_classes import HyperparamOpt
from deepchem.utils.evaluate import Evaluator
from deepchem.hyper.base_classes import _convert_hyperparam_dict_to_filename

logger = logging.getLogger(__name__)


def _convert_hyperparam_dict_to_filename(hyper_params):
  """Helper function that converts a dictionary of hyperparameters to a string that can be a filename.

  Parameters
  ----------
  hyper_params: dict
    Maps string of hyperparameter name to int/float.

  Returns
  -------
  filename: str
    A filename of form "_key1_value1_value2_..._key2..."
  """
  filename = ""
  keys = sorted(hyper_params.keys())
  for key in keys:
    filename += "_%s" % str(key)
    value = hyper_params[key]
    if isinstance(value, int):
      filename += "_%s" % str(value)
    elif isinstance(value, float):
      filename += "_%.2f" % value
    else:
      raise ValueError(
          "Hyperparameters to search must be specified as ints/floats since GaussianProcessHyperparamOpt searches over a range of numbers around the specified point."
      )
  return filename


def compute_parameter_range(params_dict, search_range):
  """Convenience Function to compute parameter search space.

@@ -82,15 +54,26 @@ def compute_parameter_range(params_dict, search_range):
  """
  # Range of optimization
  param_range = {}
  if isinstance(search_range, dict):
    if sorted(params_dict.keys()) != sorted(search_range.keys()):
      raise ValueError(
          "If search_range is provided as a dictionary, it must have the same keys as params_dict."
      )
  elif (not isinstance(search_range, int)) and (not isinstance(
      search_range, float)):
    raise ValueError("search_range must be a dict or int or float.")
  for hp, value in params_dict.items():
    if isinstance(search_range, dict):
      hp_search_range = search_range[hp]
    else:
      # We know from guard above that this is an int/float
      hp_search_range = search_range
    if isinstance(value, int):
      value_range = [value // search_range, value * search_range]
      value_range = [value // hp_search_range, value * hp_search_range]
      param_range[hp] = ("int", value_range)
      pass
    elif isinstance(value, float):
      value_range = [value / search_range, value * search_range]
      value_range = [value / hp_search_range, value * hp_search_range]
      param_range[hp] = ("cont", value_range)
      pass
  return param_range
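
A worked example of the range construction above (a sketch; it assumes compute_parameter_range can be imported from deepchem.hyper.gaussian_process, since file names are not visible in this diff view). Integer hyperparameters map to an ("int", [value // r, value * r]) range and floats to a ("cont", [value / r, value * r]) range, where r is either the scalar search_range or the per-parameter entry from a search_range dict.

from deepchem.hyper.gaussian_process import compute_parameter_range

params_dict = {"batch_size": 10, "learning_rate": 0.003}

# Scalar search_range: the same multiplier is applied to every hyperparameter.
print(compute_parameter_range(params_dict, 4))
# {'batch_size': ('int', [2, 40]), 'learning_rate': ('cont', [0.00075, 0.012])}

# Dict search_range: per-hyperparameter multipliers; keys must match params_dict.
print(compute_parameter_range(params_dict, {"batch_size": 4, "learning_rate": 10}))
# {'batch_size': ('int', [2, 40]), 'learning_rate': ('cont', [0.0003, 0.03])}
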


@@ -239,6 +222,10 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
      valid_scores: float
        valid set performances
      """
      logger.info("placeholders: %s" % str(placeholders))
      logger.info("param_range: %s" % str(param_range))
      hyper_parameters = {}
      for hp in param_keys:
        if param_range[hp][0] == "int":
@@ -335,6 +322,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
    if log_file:
      with open(log_file, 'a') as f:
        # Record hyperparameters
        f.write("params_dict:")
        f.write(str(params_dict))
        f.write('\n')

+5 −1
@@ -12,6 +12,7 @@ from functools import reduce
from operator import mul
from deepchem.utils.evaluate import Evaluator
from deepchem.hyper.base_classes import HyperparamOpt
from deepchem.hyper.base_classes import _convert_hyperparam_dict_to_filename

logger = logging.getLogger(__name__)

@@ -94,6 +95,8 @@ class GridHyperparamOpt(HyperparamOpt):
        itertools.product(*hyperparam_vals)):
      model_params = {}
      logger.info("Fitting model %d/%d" % (ind + 1, number_combinations))
      # Construct a dictionary mapping hyperparameter names to values
      hyper_params = dict(zip(hyperparams, hyperparameter_tuple))
      for hyperparam, hyperparam_val in zip(hyperparams, hyperparameter_tuple):
        model_params[hyperparam] = hyperparam_val
      logger.info("hyperparameters: %s" % str(model_params))
@@ -121,7 +124,8 @@ class GridHyperparamOpt(HyperparamOpt):
      evaluator = Evaluator(model, valid_dataset, output_transformers)
      multitask_scores = evaluator.compute_model_performance([metric])
      valid_score = multitask_scores[metric.name]
      all_scores[str(hyperparameter_tuple)] = valid_score
      hp_str = _convert_hyperparam_dict_to_filename(hyper_params)
      all_scores[hp_str] = valid_score

      if (use_max and valid_score >= best_validation_score) or (
          not use_max and valid_score <= best_validation_score):
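
With this change the keys of all_scores come from _convert_hyperparam_dict_to_filename rather than str(hyperparameter_tuple), so each entry records hyperparameter names as well as values. A minimal sketch of consuming such a result dictionary (illustrative values only, not output from the commit):

# Shaped like the all_scores dictionary built above.
all_scores = {
    "_batch_size_10_learning_rate_0.03": 0.42,
    "_batch_size_50_learning_rate_0.01": 0.37,
}
# With use_max=False (lower is better, e.g. mean squared error) the best
# setting is the key with the minimum score.
best_key = min(all_scores, key=all_scores.get)
print(best_key, all_scores[best_key])
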
+55 −1
"""
Tests for Gaussian Process Hyperparameter Optimization.

These tests fail every so often. This seems to happen when the Gaussian
process optimizer doesn't find an optimal point. The suite is still
valuable, so it is kept in despite the flakiness.
"""
import os
import numpy as np
import sklearn
import deepchem as dc
import unittest
import tempfile
from flaky import flaky


class TestGaussianHyperparamOpt(unittest.TestCase):
@@ -122,8 +128,9 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
    scores = model.evaluate(dataset, [regression_metric])
    assert scores[regression_metric.name] < .1

  @flaky
  def test_multitask_example(self):
    """Test a simple example of optimizing a multitask model with a grid search."""
    """Test a simple example of optimizing a multitask model with a gaussian process search."""
    # Generate dummy dataset
    np.random.seed(123)
    train_dataset = dc.data.NumpyDataset(
@@ -155,3 +162,50 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
    valid_score = best_model.evaluate(valid_dataset, [metric])
    assert valid_score["mean-mean_squared_error"] == min(all_results.values())
    assert valid_score["mean-mean_squared_error"] > 0

  @flaky
  def test_multitask_example_different_search_range(self):
    """Test a simple example of optimizing a multitask model with a gaussian process search with per-parameter search range."""
    # Generate dummy dataset
    np.random.seed(123)
    train_dataset = dc.data.NumpyDataset(
        np.random.rand(10, 3), np.zeros((10, 2)), np.ones((10, 2)),
        np.arange(10))
    valid_dataset = dc.data.NumpyDataset(
        np.random.rand(5, 3), np.zeros((5, 2)), np.ones((5, 2)), np.arange(5))

    optimizer = dc.hyper.GaussianProcessHyperparamOpt(
        lambda **p: dc.models.MultitaskRegressor(
            n_tasks=2,
            n_features=3,
            dropouts=[0.],
            weight_init_stddevs=[np.sqrt(6) / np.sqrt(1000)],
            #learning_rate=0.003, **p))
            **p))

    params_dict = {"learning_rate": 0.003, "batch_size": 10}
    # These are per-hyperparameter multipliers
    search_range = {"learning_rate": 10, "batch_size": 4}
    transformers = []
    metric = dc.metrics.Metric(
        dc.metrics.mean_squared_error, task_averager=np.mean)

    with tempfile.TemporaryDirectory() as tmpdirname:
      best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
          params_dict,
          train_dataset,
          valid_dataset,
          transformers,
          metric,
          max_iter=2,
          logdir=tmpdirname,
          search_range=search_range,
          use_max=False)
      valid_score = best_model.evaluate(valid_dataset, [metric])
    # Test that 2 parameters were optimized
    for hp_str in all_results.keys():
      # Recall that the key is a string of the form _batch_size_39_learning_rate_0.01 for example
      assert "batch_size" in hp_str
      assert "learning_rate" in hp_str
    assert valid_score["mean-mean_squared_error"] == min(all_results.values())
    assert valid_score["mean-mean_squared_error"] > 0
+44 −0
@@ -120,3 +120,47 @@ class TestGridHyperparamOpt(unittest.TestCase):
    valid_score = best_model.evaluate(valid_dataset, [metric])
    assert valid_score["mean-mean_squared_error"] == min(all_results.values())
    assert valid_score["mean-mean_squared_error"] > 0

  def test_multitask_example_multiple_params(self):
    """Test a simple example of optimizing a multitask model with a grid search with multiple parameters to optimize."""
    # Generate dummy dataset
    np.random.seed(123)
    train_dataset = dc.data.NumpyDataset(
        np.random.rand(10, 3), np.zeros((10, 2)), np.ones((10, 2)),
        np.arange(10))
    valid_dataset = dc.data.NumpyDataset(
        np.random.rand(5, 3), np.zeros((5, 2)), np.ones((5, 2)), np.arange(5))

    optimizer = dc.hyper.GridHyperparamOpt(
        lambda **p: dc.models.MultitaskRegressor(
            n_tasks=2,
            n_features=3,
            dropouts=[0.],
            weight_init_stddevs=[np.sqrt(6) / np.sqrt(1000)],
            #learning_rate=0.003, **p))
            **p))

    params_dict = {"learning_rate": [0.003, 0.03], "batch_size": [10, 50]}
    transformers = []
    metric = dc.metrics.Metric(
        dc.metrics.mean_squared_error, task_averager=np.mean)

    with tempfile.TemporaryDirectory() as tmpdirname:
      best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
          params_dict,
          train_dataset,
          valid_dataset,
          transformers,
          metric,
          logdir=tmpdirname,
          use_max=False)
      valid_score = best_model.evaluate(valid_dataset, [metric])
    # Test that 2 parameters were optimized
    for hp_str in all_results.keys():
      # Recall that the key is a string of the form _batch_size_39_learning_rate_0.01 for example
      assert "batch_size" in hp_str
      assert "learning_rate" in hp_str

    assert valid_score["mean-mean_squared_error"] == min(all_results.values())
    assert valid_score["mean-mean_squared_error"] > 0