:recycle: add typing (628d726e) · Commits · 钟慕尧 / deepchem

deepchem/dock/pose_generation.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -298,6 +298,7 @@ class VinaPoseGenerator(PoseGenerator):
		else:
		# I'm not sure why specifying the args as a list fails on other platforms,
		# but for some reason it only works if I pass it as a string.
		# FIXME: Incompatible types in assignment
		args = "%s --config %s --log %s --out %s" % ( # type: ignore
		self.vina_cmd, conf_file, log_file, out_pdbqt)
		# FIXME: We should use `subprocess.run` instead of `call`

deepchem/hyper/base_classes.py

+23 −20

Original line number	Diff line number	Diff line
		import logging
		from typing import Any, Callable, Dict, Optional, Tuple

		from deepchem.data import Dataset
		from deepchem.models import Model
		from deepchem.metrics import Metric

		logger = logging.getLogger(__name__)


		def _convert_hyperparam_dict_to_filename(hyper_params):
		def _convert_hyperparam_dict_to_filename(hyper_params: Dict) -> str:
		"""Helper function that converts a dictionary of hyperparameters to a string that can be a filename.

		Parameters
		----------
		hyper_params: dict
		Maps string of hyperparameter name to int/float.
		hyper_params: Dict
		Maps string of hyperparameter name to int/float/string/list etc.

		Returns
		-------
		@@ -47,7 +52,7 @@ class HyperparamOpt(object):
		instantiated.
		"""

		def __init__(self, model_builder):
		def __init__(self, model_builder: Callable[..., Model]):
		"""Initialize Hyperparameter Optimizer.

		Note this is an abstract constructor which should only be used by
		@@ -64,18 +69,19 @@ class HyperparamOpt(object):
		"""
		if self.__class__.__name__ == "HyperparamOpt":
		raise ValueError(
		"HyperparamOpt is an abstract superclass and cannot be directly instantiated. You probably want to instantiate a concrete subclass instead."
		)
		"HyperparamOpt is an abstract superclass and cannot be directly instantiated. \
		You probably want to instantiate a concrete subclass instead.")
		self.model_builder = model_builder

		def hyperparam_search(self,
		params_dict,
		train_dataset,
		valid_dataset,
		transformers,
		metric,
		use_max=True,
		logdir=None):
		def hyperparam_search(
		self,
		params_dict: Dict[str, Any],
		train_dataset: Dataset,
		valid_dataset: Dataset,
		metric: Metric,
		use_max: bool = True,
		logdir: Optional[str] = None,
		**kwargs) -> Tuple[Model, Dict[str, Any], Dict[str, float]]:
		"""Conduct Hyperparameter search.

		This method defines the common API shared by all hyperparameter
		@@ -84,7 +90,7 @@ class HyperparamOpt(object):

		Parameters
		----------
		params_dict: dict
		params_dict: Dict
		Dictionary mapping strings to values. Note that the
		precise semantics of `params_dict` will change depending on the
		optimizer that you're using. Depending on the type of
		@@ -96,11 +102,8 @@ class HyperparamOpt(object):
		dataset used for training
		valid_dataset: `dc.data.Dataset`
		dataset used for validation (optimization on validation scores)
		output_transformers: list[dc.trans.Transformer]
		Transformers for evaluation. This argument is needed since
		`train_dataset` and `valid_dataset` may have been transformed
		for learning and need the transform to be inverted before
		the metric can be evaluated on a model.
		metric: `dc.metrics.Metric`
		metric used for evaluation
		use_max: bool, optional
		If True, return the model with the highest score. Else return
		model with the minimum score.

deepchem/hyper/gaussian_process.py

+43 −33

Original line number	Diff line number	Diff line
		@@ -4,23 +4,30 @@ Contains class for gaussian process hyperparameter optimizations.
		import os
		import logging
		import tempfile
		from typing import Dict, List, Optional, Tuple, Union

		from deepchem.data import Dataset
		from deepchem.metrics import Metric
		from deepchem.hyper.base_classes import HyperparamOpt
		from deepchem.hyper.base_classes import _convert_hyperparam_dict_to_filename

		logger = logging.getLogger(__name__)


		def compute_parameter_range(params_dict, search_range):
		def compute_parameter_range(
		params_dict: Dict[str, Union[int, float]],
		search_range: Union[int, float, Dict[str, Union[int, float]]]
		) -> Dict[str, Tuple[str, List[float]]]:
		"""Convenience Function to compute parameter search space.

		Parameters
		----------
		params_dict: dict
		params_dict: Dict
		Dictionary mapping strings to Ints/Floats. An explicit list of
		parameters is computed with `search_range`. The optimization range
		computed is specified in the documentation for `search_range`
		below.
		search_range: int(float)/dict (default 4)
		search_range: int/float/Dict (default 4)
		The `search_range` specifies the range of parameter values to
		search for. If `search_range` is an int/float, it is used as the
		global search range for parameters. This creates a search
		@@ -41,7 +48,7 @@ def compute_parameter_range(params_dict, search_range):

		Returns
		-------
		param_range: dict
		param_range: Dict
		Dictionary mapping hyperparameter names to tuples. Each tuple is
		of form `(value_type, value_range)` where `value_type` is a string
		that is either "int" or "cont" and `value_range` is a list of two
		@@ -115,22 +122,24 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
		>>> optimizer = dc.hyper.GaussianProcessHyperparamOpt(model_builder)
		"""

		def hyperparam_search(self,
		params_dict,
		train_dataset,
		valid_dataset,
		transformers,
		metric,
		use_max=True,
		logdir=None,
		max_iter=20,
		search_range=4,
		logfile=None):
		# NOTE: mypy prohibits changing the number of arguments
		# FIXME: Signature of "hyperparam_search" incompatible with supertype "HyperparamOpt"
		def hyperparam_search( # type: ignore[override]
		self,
		params_dict: Dict[str, Union[int, float]],
		train_dataset: Dataset,
		valid_dataset: Dataset,
		metric: Metric,
		use_max: bool = True,
		logdir: Optional[str] = None,
		max_iter: int = 20,
		search_range: Union[int, float, Dict[str, Union[int, float]]] = 4,
		logfile: Optional[str] = None):
		"""Perform hyperparameter search using a gaussian process.

		Parameters
		----------
		params_dict: dict
		params_dict: Dict
		Maps hyperparameter names (strings) to possible parameter
		values. The semantics of this list are different than for
		`GridHyperparamOpt`. `params_dict[hp]` must map to an int/float,
		@@ -141,19 +150,17 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
		dataset used for training
		valid_dataset: `dc.data.Dataset`
		dataset used for validation(optimization on valid scores)
		transformers: list[dc.trans.Transformer]
		transformers for evaluation
		metric: `dc.metrics.Metric`
		metric used for evaluation
		use_max: bool, (default True)
		Specifies whether to maximize or minimize `metric`.
		maximization(True) or minimization(False)
		logdir: str, optional
		logdir: str, optional, (default None)
		The directory in which to store created models. If not set, will
		use a temporary directory.
		max_iter: int, (default 20)
		number of optimization trials
		search_range: int(float)/dict (default 4)
		search_range: int/float/Dict (default 4)
		The `search_range` specifies the range of parameter values to
		search for. If `search_range` is an int/float, it is used as the
		global search range for parameters. This creates a search
		@@ -171,7 +178,7 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):

		optimization on hp on [initial value[hp] / search_range[hp],
		initial value[hp] * search_range[hp]]
		logfile: str
		logfile: str, optional (default None)
		Name of logfile to write results to. If specified, this must
		be a valid file. If not specified, results of hyperparameter
		search will be written to `logdir/results.txt`.
		@@ -180,12 +187,21 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
		Returns
		-------
		`(best_model, best_hyperparams, all_scores)` where `best_model` is
		an instance of `dc.model.Models`, `best_hyperparams` is a
		an instance of `dc.model.Model`, `best_hyperparams` is a
		dictionary of parameters, and `all_scores` is a dictionary mapping
		string representations of hyperparameter sets to validation
		scores.
		"""
		try:
		from pyGPGO.covfunc import matern32
		from pyGPGO.acquisition import Acquisition
		from pyGPGO.surrogates.GaussianProcess import GaussianProcess
		from pyGPGO.GPGO import GPGO
		except ModuleNotFoundError:
		raise ValueError("This class requires pyGPGO to be installed.")

		# Specify logfile
		log_file = None
		if logfile:
		log_file = logfile
		elif logdir is not None:
		@@ -193,8 +209,6 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
		if not os.path.exists(logdir):
		os.makedirs(logdir, exist_ok=True)
		log_file = os.path.join(logdir, "results.txt")
		else:
		log_file = None

		# setup range
		param_range = compute_parameter_range(params_dict, search_range)
		@@ -208,7 +222,6 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
		model_locations = {}

		# Demarcating internal function for readability
		########################
		def optimizing_function(**placeholders):
		"""Private Optimizing function

		@@ -275,18 +288,14 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
		# Store reference to model
		all_models[hp_str] = model
		model_locations[hp_str] = model_dir
		# GPGO maximize performance by default, set performance to its negative value for minimization
		# GPGO maximizes performance by default, so
		# set performance to its negative value for minimization
		if use_max:
		return score
		else:
		return -score

		########################

		from pyGPGO.covfunc import matern32
		from pyGPGO.acquisition import Acquisition
		from pyGPGO.surrogates.GaussianProcess import GaussianProcess
		from pyGPGO.GPGO import GPGO
		# execute GPGO
		cov = matern32()
		gp = GaussianProcess(cov)
		acq = Acquisition(mode='ExpectedImprovement')
		@@ -300,7 +309,8 @@ class GaussianProcessHyperparamOpt(HyperparamOpt):
		if param_range[hp][0] == "int":
		hyper_parameters[hp] = int(hp_opt[hp])
		else:
		hyper_parameters[hp] = float(hp_opt[hp])
		# Incompatible types in assignment
		hyper_parameters[hp] = float(hp_opt[hp]) # type: ignore
		hp_str = _convert_hyperparam_dict_to_filename(hyper_parameters)

		# Let's fetch the model with the best parameters

deepchem/hyper/grid_search.py

+24 −12

Original line number	Diff line number	Diff line
		@@ -10,6 +10,11 @@ import collections
		import logging
		from functools import reduce
		from operator import mul
		from typing import Dict, List, Optional

		from deepchem.data import Dataset
		from deepchem.trans import Transformer
		from deepchem.metrics import Metric
		from deepchem.utils.evaluate import Evaluator
		from deepchem.hyper.base_classes import HyperparamOpt
		from deepchem.hyper.base_classes import _convert_hyperparam_dict_to_filename
		@@ -55,14 +60,18 @@ class GridHyperparamOpt(HyperparamOpt):

		"""

		def hyperparam_search(self,
		params_dict,
		train_dataset,
		valid_dataset,
		output_transformers,
		metric,
		use_max=True,
		logdir=None):
		# NOTE: mypy prohibits changing the number of arguments
		# FIXME: Signature of "hyperparam_search" incompatible with supertype "HyperparamOpt"
		def hyperparam_search( # type: ignore[override]
		self,
		params_dict: Dict[str, List],
		train_dataset: Dataset,
		valid_dataset: Dataset,
		output_transformers: List[Transformer],
		metric: Metric,
		use_max: bool = True,
		logdir: Optional[str] = None,
		):
		"""Perform hyperparams search according to params_dict.

		Each key of `params_dict` is a model_param. The values should
		@@ -70,15 +79,13 @@ class GridHyperparamOpt(HyperparamOpt):

		Parameters
		----------
		params_dict: Dict[str, list]
		params_dict: Dict
		Maps hyperparameter names (strings) to lists of possible
		parameter values.
		train_dataset: `dc.data.Dataset`
		dataset used for training
		valid_dataset: `dc.data.Dataset`
		dataset used for validation(optimization on valid scores)
		output_transformers: list[dc.trans.Transformer]
		transformers for evaluation
		metric: dc.metrics.Metric
		metric used for evaluation
		use_max: bool, optional
		@@ -87,11 +94,16 @@ class GridHyperparamOpt(HyperparamOpt):
		logdir: str, optional
		The directory in which to store created models. If not set, will
		use a temporary directory.
		output_transformers: list[dc.trans.Transformer]
		Transformers for evaluation. This argument is needed since
		`train_dataset` and `valid_dataset` may have been transformed
		for learning and need the transform to be inverted before
		the metric can be evaluated on a model.

		Returns
		-------
		`(best_model, best_hyperparams, all_scores)` where `best_model` is
		an instance of `dc.model.Models`, `best_hyperparams` is a
		an instance of `dc.model.Model`, `best_hyperparams` is a
		dictionary of parameters, and `all_scores` is a dictionary mapping
		string representations of hyperparameter sets to validation
		scores.

deepchem/hyper/tests/test_gaussian_hyperparam_opt.py

+6 −6

Original line number	Diff line number	Diff line
		@@ -108,10 +108,10 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
		np.random.rand(5, 3), np.zeros((5, 2)), np.ones((5, 2)), np.arange(5))

		optimizer = dc.hyper.GaussianProcessHyperparamOpt(
		lambda **p: dc.models.MultitaskRegressor(n_tasks=2,
		lambda **params: dc.models.MultitaskRegressor(n_tasks=2,
		n_features=3, dropouts=[0.],
		weight_init_stddevs=[np.sqrt(6) / np.sqrt(1000)],
		learning_rate=0.003, **p))
		learning_rate=0.003, **params))

		params_dict = {"batch_size": 10}
		transformers = []
		@@ -143,12 +143,12 @@ class TestGaussianHyperparamOpt(unittest.TestCase):
		np.random.rand(5, 3), np.zeros((5, 2)), np.ones((5, 2)), np.arange(5))

		optimizer = dc.hyper.GaussianProcessHyperparamOpt(
		lambda **p: dc.models.MultitaskRegressor(
		lambda **params: dc.models.MultitaskRegressor(
		n_tasks=2,
		n_features=3,
		dropouts=[0.],
		weight_init_stddevs=[np.sqrt(6) / np.sqrt(1000)],
		**p))
		**params))

		params_dict = {"learning_rate": 0.003, "batch_size": 10}
		# These are per-example multiplier

Admin message