Unverified commit af0c1a69, authored by Daiki Nishikawa, committed by GitHub

Merge pull request #2227 from nd-02110114/fix-api

Fix inconsistent API for HyperparamOpt
parents 0fcc6b40 c032aaac
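The point of the change: every `HyperparamOpt` subclass now shares one call signature, `(params_dict, train_dataset, valid_dataset, metric, output_transformers=[], use_max=True, logdir=None, **kwargs)`. A minimal sketch of the unified call, assuming toy data and an illustrative parameter grid (none of this is taken from the diff):

```python
import numpy as np
import deepchem as dc

# Toy regression data: 2 tasks, 3 features.
train = dc.data.NumpyDataset(np.random.rand(10, 3), np.random.rand(10, 2))
valid = dc.data.NumpyDataset(np.random.rand(5, 3), np.random.rand(5, 2))
metric = dc.metrics.Metric(dc.metrics.mean_squared_error)

optimizer = dc.hyper.GridHyperparamOpt(
    lambda **params: dc.models.MultitaskRegressor(
        n_tasks=2, n_features=3, **params))

# After this PR the argument order is the same for every subclass:
# (params_dict, train_dataset, valid_dataset, metric, output_transformers, ...)
best_model, best_params, all_scores = optimizer.hyperparam_search(
    {"learning_rate": [0.01, 0.001]}, train, valid, metric,
    output_transformers=[], use_max=False)
```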
+1 −1
@@ -233,7 +233,7 @@ class MolecularFeaturizer(Featurizer):
  The subclasses of this class require RDKit to be installed.
  """

-  def featurize(self, molecules, log_every_n=1000):
+  def featurize(self, molecules, log_every_n=1000) -> np.ndarray:
    """Calculate features for molecules.

    Parameters
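The added annotation documents existing behavior: `featurize` already returns a NumPy array. A hedged example using `CircularFingerprint` as a representative `MolecularFeaturizer` subclass (requires RDKit; the SMILES inputs are illustrative):

```python
import numpy as np
import deepchem as dc

featurizer = dc.feat.CircularFingerprint(size=1024)
features = featurizer.featurize(["CCO", "c1ccccc1"])  # ethanol, benzene
assert isinstance(features, np.ndarray)
print(features.shape)  # expected: (2, 1024)
```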
+16 −10
import logging
-from typing import Any, Callable, Dict, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple

from deepchem.data import Dataset
+from deepchem.trans import Transformer
from deepchem.models import Model
from deepchem.metrics import Metric

@@ -73,15 +74,15 @@ class HyperparamOpt(object):
          You probably want to instantiate a concrete subclass instead.")
    self.model_builder = model_builder

-  def hyperparam_search(
-      self,
-      params_dict: Dict[str, Any],
-      train_dataset: Dataset,
-      valid_dataset: Dataset,
-      metric: Metric,
-      use_max: bool = True,
-      logdir: Optional[str] = None,
-      **kwargs) -> Tuple[Model, Dict[str, Any], Dict[str, float]]:
+  def hyperparam_search(self,
+                        params_dict: Dict,
+                        train_dataset: Dataset,
+                        valid_dataset: Dataset,
+                        metric: Metric,
+                        output_transformers: List[Transformer] = [],
+                        use_max: bool = True,
+                        logdir: Optional[str] = None,
+                        **kwargs) -> Tuple[Model, Dict, Dict]:
    """Conduct Hyperparameter search.

    This method defines the common API shared by all hyperparameter
@@ -104,6 +105,11 @@ class HyperparamOpt(object):
      dataset used for validation(optimization on valid scores)
    metric: Metric
      metric used for evaluation
+    output_transformers: list[Transformer]
+      Transformers for evaluation. This argument is needed since
+      `train_dataset` and `valid_dataset` may have been transformed
+      for learning and need the transform to be inverted before
+      the metric can be evaluated on a model.
    use_max: bool, optional
      If True, return the model with the highest score. Else return
      model with the minimum score.
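The new `output_transformers` parameter matters whenever targets were transformed before training: as the added docstring says, the transform must be inverted before the metric is meaningful. A hedged sketch of that situation with illustrative toy data:

```python
import numpy as np
import deepchem as dc

train = dc.data.NumpyDataset(np.random.rand(10, 3), np.random.rand(10, 1) * 100)
valid = dc.data.NumpyDataset(np.random.rand(5, 3), np.random.rand(5, 1) * 100)

# Normalize y for training; the same transformer later inverts predictions.
transformer = dc.trans.NormalizationTransformer(transform_y=True, dataset=train)
train = transformer.transform(train)
valid = transformer.transform(valid)

metric = dc.metrics.Metric(dc.metrics.mean_squared_error)
optimizer = dc.hyper.GridHyperparamOpt(
    lambda **params: dc.models.MultitaskRegressor(
        n_tasks=1, n_features=3, **params))

# Passing [transformer] lets evaluation undo the y-normalization before scoring.
best_model, best_params, scores = optimizer.hyperparam_search(
    {"learning_rate": [0.01]}, train, valid, metric,
    output_transformers=[transformer], use_max=False)
```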
+22 −17
@@ -7,16 +7,16 @@ import tempfile
from typing import Dict, List, Optional, Tuple, Union

from deepchem.data import Dataset
+from deepchem.trans import Transformer
from deepchem.metrics import Metric
from deepchem.hyper.base_classes import HyperparamOpt
from deepchem.hyper.base_classes import _convert_hyperparam_dict_to_filename

logger = logging.getLogger(__name__)
-PARAM_DICT = Dict[str, Union[int, float]]


-def compute_parameter_range(params_dict: PARAM_DICT,
-                            search_range: Union[int, float, PARAM_DICT]
+def compute_parameter_range(params_dict: Dict,
+                            search_range: Union[int, float, Dict]
                           ) -> Dict[str, Tuple[str, List[float]]]:
  """Convenience Function to compute parameter search space.

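From the annotated return type, `compute_parameter_range` maps each parameter name to a `(type, [low, high])` pair. A hedged sketch of a call; the `'cont'`/`'int'` tags and the `[value/search_range, value*search_range]` bounds follow pyGPGO's conventions and are assumptions, not something shown in this diff:

```python
from deepchem.hyper.gaussian_process import compute_parameter_range

space = compute_parameter_range({"learning_rate": 0.01, "batch_size": 32}, 4)
print(space)
# Plausibly: {'learning_rate': ('cont', [0.0025, 0.04]),
#             'batch_size': ('int', [8.0, 128.0])}
```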
@@ -126,19 +126,18 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
  This class requires pyGPGO to be installed.
  """

-  # NOTE: mypy prohibits changing the number of arguments
-  # FIXME: Signature of "hyperparam_search" incompatible with supertype "HyperparamOpt"
-  def hyperparam_search(  # type: ignore[override]
-      self,
-      params_dict: PARAM_DICT,
-      train_dataset: Dataset,
-      valid_dataset: Dataset,
-      metric: Metric,
-      use_max: bool = True,
-      logdir: Optional[str] = None,
-      max_iter: int = 20,
-      search_range: Union[int, float, PARAM_DICT] = 4,
-      logfile: Optional[str] = None):
+  def hyperparam_search(self,
+                        params_dict: Dict,
+                        train_dataset: Dataset,
+                        valid_dataset: Dataset,
+                        metric: Metric,
+                        output_transformers: List[Transformer] = [],
+                        use_max: bool = True,
+                        logdir: Optional[str] = None,
+                        max_iter: int = 20,
+                        search_range: Union[int, float, Dict] = 4,
+                        logfile: Optional[str] = None,
+                        **kwargs):
    """Perform hyperparameter search using a gaussian process.

    Parameters
@@ -156,6 +155,11 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
      dataset used for validation(optimization on valid scores)
    metric: Metric
      metric used for evaluation
+    output_transformers: list[Transformer]
+      Transformers for evaluation. This argument is needed since
+      `train_dataset` and `valid_dataset` may have been transformed
+      for learning and need the transform to be inverted before
+      the metric can be evaluated on a model.
    use_max: bool, (default True)
      Specifies whether to maximize or minimize `metric`.
      maximization(True) or minimization(False)
@@ -280,7 +284,8 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
      except NotImplementedError:
        pass

-      multitask_scores = model.evaluate(valid_dataset, [metric])
+      multitask_scores = model.evaluate(valid_dataset, [metric],
+                                        output_transformers)
      score = multitask_scores[metric.name]

      if log_file:
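A hedged sketch of the Gaussian-process search after this change; unlike the grid search, `params_dict` here maps each name to a single seed value that `search_range` widens. Requires pyGPGO; the data and values are illustrative:

```python
import numpy as np
import deepchem as dc

train = dc.data.NumpyDataset(np.random.rand(10, 3), np.random.rand(10, 2))
valid = dc.data.NumpyDataset(np.random.rand(5, 3), np.random.rand(5, 2))
metric = dc.metrics.Metric(dc.metrics.mean_squared_error)

optimizer = dc.hyper.GaussianProcessHyperparamOpt(
    lambda **params: dc.models.MultitaskRegressor(
        n_tasks=2, n_features=3, **params))

# transformers now occupy the same positional slot as in the base class.
best_model, best_params, all_results = optimizer.hyperparam_search(
    {"learning_rate": 0.01}, train, valid, metric, [],
    use_max=False, max_iter=2, search_range=4)
```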
+11 −17
@@ -10,12 +10,11 @@ import collections
import logging
from functools import reduce
from operator import mul
-from typing import cast, Dict, List, Optional
+from typing import Dict, List, Optional

from deepchem.data import Dataset
from deepchem.trans import Transformer
from deepchem.metrics import Metric
-from deepchem.utils.evaluate import Evaluator
from deepchem.hyper.base_classes import HyperparamOpt
from deepchem.hyper.base_classes import _convert_hyperparam_dict_to_filename

@@ -60,17 +59,16 @@ class GridHyperparamOpt(HyperparamOpt):

  """

-  # NOTE: mypy prohibits changing the number of arguments
-  # FIXME: Signature of "hyperparam_search" incompatible with supertype "HyperparamOpt"
-  def hyperparam_search(  # type: ignore[override]
+  def hyperparam_search(
      self,
-      params_dict: Dict[str, List],
+      params_dict: Dict,
      train_dataset: Dataset,
      valid_dataset: Dataset,
-      output_transformers: List[Transformer],
      metric: Metric,
+      output_transformers: List[Transformer] = [],
      use_max: bool = True,
      logdir: Optional[str] = None,
-      **kwargs):
+      **kwargs,
+  ):
    """Perform hyperparams search according to params_dict.

@@ -86,13 +84,13 @@ class GridHyperparamOpt(HyperparamOpt):
      dataset used for training
    valid_dataset: Dataset
      dataset used for validation(optimization on valid scores)
+    metric: Metric
+      metric used for evaluation
    output_transformers: list[Transformer]
      Transformers for evaluation. This argument is needed since
      `train_dataset` and `valid_dataset` may have been transformed
      for learning and need the transform to be inverted before
      the metric can be evaluated on a model.
-    metric: Metric
-      metric used for evaluation
    use_max: bool, optional
      If True, return the model with the highest score. Else return
      model with the minimum score.
@@ -153,10 +151,8 @@ class GridHyperparamOpt(HyperparamOpt):
      except NotImplementedError:
        pass

-      evaluator = Evaluator(model, valid_dataset, output_transformers)
-      multitask_scores = evaluator.compute_model_performance([metric])
-      # NOTE: this casting is workaround. This line doesn't effect anything to the runtime
-      multitask_scores = cast(Dict[str, float], multitask_scores)
+      multitask_scores = model.evaluate(valid_dataset, [metric],
+                                        output_transformers)
      valid_score = multitask_scores[metric.name]
      hp_str = _convert_hyperparam_dict_to_filename(hyper_params)
      all_scores[hp_str] = valid_score
@@ -180,10 +176,8 @@ class GridHyperparamOpt(HyperparamOpt):
      # arbitrarily return last model
      best_model, best_hyperparams = model, hyperparameter_tuple
      return best_model, best_hyperparams, all_scores
-    train_evaluator = Evaluator(best_model, train_dataset, output_transformers)
-    multitask_scores = train_evaluator.compute_model_performance([metric])
-    # NOTE: this casting is workaround. This line doesn't effect anything to the runtime
-    multitask_scores = cast(Dict[str, float], multitask_scores)
+    multitask_scores = best_model.evaluate(train_dataset, [metric],
+                                           output_transformers)
    train_score = multitask_scores[metric.name]
    logger.info("Best hyperparameters: %s" % str(best_hyperparams))
    logger.info("train_score: %f" % train_score)
+8 −2
@@ -62,6 +62,7 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
        self.train_dataset,
        self.valid_dataset,
        metric,
+        transformers,
        use_max=False,
        max_iter=2)

@@ -82,6 +83,7 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
          self.train_dataset,
          self.valid_dataset,
          metric,
+          transformers,
          logdir=tmpdirname,
          max_iter=2)
    valid_score = best_model.evaluate(self.valid_dataset, [metric],
@@ -99,6 +101,7 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
        np.arange(10))
    valid_dataset = dc.data.NumpyDataset(
        np.random.rand(5, 3), np.zeros((5, 2)), np.ones((5, 2)), np.arange(5))
+    transformers = []

    optimizer = dc.hyper.GaussianProcessHyperparamOpt(
        lambda **params: dc.models.MultitaskRegressor(n_tasks=2,
@@ -115,10 +118,11 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
        train_dataset,
        valid_dataset,
        metric,
+        transformers,
        max_iter=1,
        use_max=False)

-    valid_score = best_model.evaluate(valid_dataset, [metric])
+    valid_score = best_model.evaluate(valid_dataset, [metric], transformers)
    assert valid_score["mean-mean_squared_error"] == min(all_results.values())
    assert valid_score["mean-mean_squared_error"] > 0

@@ -132,6 +136,7 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
        np.arange(10))
    valid_dataset = dc.data.NumpyDataset(
        np.random.rand(5, 3), np.zeros((5, 2)), np.ones((5, 2)), np.arange(5))
+    transformers = []

    optimizer = dc.hyper.GaussianProcessHyperparamOpt(
        lambda **params: dc.models.MultitaskRegressor(
@@ -153,11 +158,12 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
          train_dataset,
          valid_dataset,
          metric,
+          transformers,
          max_iter=2,
          logdir=tmpdirname,
          search_range=search_range,
          use_max=False)
-      valid_score = best_model.evaluate(valid_dataset, [metric])
+      valid_score = best_model.evaluate(valid_dataset, [metric], transformers)
    # Test that 2 parameters were optimized
    for hp_str in all_results.keys():
      # Recall that the key is a string of the form _batch_size_39_learning_rate_0.01 for example
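For reference, the key format the comment describes comes from the `_convert_hyperparam_dict_to_filename` helper imported earlier; a small hedged illustration:

```python
from deepchem.hyper.base_classes import _convert_hyperparam_dict_to_filename

hp_str = _convert_hyperparam_dict_to_filename(
    {"batch_size": 39, "learning_rate": 0.01})
print(hp_str)  # expected: "_batch_size_39_learning_rate_0.01"
```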