Merge pull request #2110 from nd-02110114/model-update (bc05e7c1) · Commits · 钟慕尧 / deepchem

deepchem/models/init.py

+10 −4

Original line number	Diff line number	Diff line
		@@ -2,11 +2,8 @@
		Gathers all models in one place for convenient imports
		"""
		# flake8: noqa

		from deepchem.models.models import Model
		from deepchem.models.keras_model import KerasModel
		from deepchem.models.sklearn_models import SklearnModel
		from deepchem.models.xgboost_models import XGBoostModel
		from deepchem.models.multitask import SingletaskToMultitask
		from deepchem.models.callbacks import ValidationCallback

		@@ -27,6 +24,15 @@ from deepchem.models.text_cnn import TextCNNModel
		from deepchem.models.atomic_conv import AtomicConvModel
		from deepchem.models.chemnet_models import Smiles2Vec, ChemCeption

		# scikit-learn model
		from deepchem.models.sklearn_models import SklearnModel

		# XGBoost model
		try:
		from deepchem.models.xgboost_models import XGBoostModel
		except ModuleNotFoundError:
		pass

		# PyTorch models
		try:
		from deepchem.models.torch_models import TorchModel

deepchem/models/models.py

+23 −38

Original line number	Diff line number	Diff line
		@@ -2,28 +2,20 @@
		Contains an abstract base class that supports different ML models.
		"""

		import sys
		import numpy as np
		import pandas as pd
		import joblib
		import os
		import shutil
		import tempfile
		import sklearn
		import logging
		from typing import Any, List, Optional, Sequence

		import numpy as np
		from sklearn.base import BaseEstimator

		import logging
		from deepchem.data import Dataset, pad_features
		from deepchem.data import Dataset
		from deepchem.metrics import Metric
		from deepchem.trans import Transformer, undo_transforms
		from deepchem.utils.save import load_from_disk
		from deepchem.utils.save import save_to_disk
		from deepchem.utils.evaluate import Evaluator

		from typing import Any, Dict, List, Optional, Sequence
		from deepchem.utils.typing import OneOrMany

		logger = logging.getLogger(__name__)


		@@ -33,7 +25,7 @@ class Model(BaseEstimator):
		"""

		def __init__(self,
		model_instance: Optional[Any] = None,
		model_instance=None,
		model_dir: Optional[str] = None,
		**kwargs) -> None:
		"""Abstract class for all models.
		@@ -51,8 +43,8 @@ class Model(BaseEstimator):
		"""
		if self.__class__.__name__ == "Model":
		raise ValueError(
		"This constructor is for an abstract class and should never be called directly. Can only call from subclass constructors."
		)
		"This constructor is for an abstract class and should never be called directly."
		"Can only call from subclass constructors.")
		self.model_dir_is_temp = False
		if model_dir is not None:
		if not os.path.exists(model_dir):
		@@ -68,21 +60,17 @@ class Model(BaseEstimator):
		if 'model_dir_is_temp' in dir(self) and self.model_dir_is_temp:
		shutil.rmtree(self.model_dir)

		def fit_on_batch(self, X: Sequence, y: Sequence, w: Sequence) -> float:
		def fit_on_batch(self, X: Sequence, y: Sequence, w: Sequence):
		"""Perform a single step of training.

		Parameters
		----------
		X: ndarray
		X: np.ndarray
		the inputs for the batch
		y: ndarray
		y: np.ndarray
		the labels for the batch
		w: ndarray
		w: np.ndarray
		the weights for the batch

		Returns
		-------
		the loss on the batch
		"""
		raise NotImplementedError(
		"Each model is responsible for its own fit_on_batch method.")
		@@ -140,6 +128,7 @@ class Model(BaseEstimator):

		Returns
		-------
		float
		The average loss over the most recent checkpoint interval.
		"""
		for epoch in range(nb_epoch):
		@@ -152,28 +141,24 @@ class Model(BaseEstimator):
		return np.array(losses).mean()

		def predict(self, dataset: Dataset,
		transformers: List[Transformer] = []) -> OneOrMany[np.ndarray]:
		transformers: List[Transformer] = []) -> np.ndarray:
		"""
		Uses self to make predictions on provided Dataset object.


		Parameters
		----------
		dataset: dc.data.Dataset
		dataset: Dataset
		Dataset to make prediction on
		transformers: list of dc.trans.Transformers
		transformers: List[Transformer]
		Transformers that the input data has been transformed by. The output
		is passed through these transformers to undo the transformations.

		Returns
		-------
		a NumPy array of the model produces a single output, or a list of arrays
		if it produces multiple outputs
		np.ndarray
		A numpy array of predictions the model produces.
		"""
		y_preds = []
		n_tasks = self.get_num_tasks()
		ind = 0

		for (X_batch, _, _, ids_batch) in dataset.iterbatches(deterministic=True):
		n_samples = len(X_batch)
		y_pred_batch = self.predict_on_batch(X_batch)
		@@ -205,9 +190,9 @@ class Model(BaseEstimator):

		Parameters
		----------
		dataset: `dc.data.Dataset`
		dataset: Dataset
		Dataset object.
		metrics: dc.metrics.Metric/list[dc.metrics.Metric]/function
		metrics: Metric / List[Metric] / function
		The set of metrics provided. This class attempts to do some
		intelligent handling of input. If a single `dc.metrics.Metric`
		object is provided or a list is provided, it will evaluate
		@@ -218,11 +203,11 @@ class Model(BaseEstimator):
		`np.ndarray` objects and return a floating point score. The
		metric function may also accept a keyword argument
		`sample_weight` to account for per-sample weights.
		transformers: list
		transformers: List[Transformer]
		List of `dc.trans.Transformer` objects. These transformations
		must have been applied to `dataset` previously. The dataset will
		be untransformed for metric evaluation.
		per_task_metrics: bool, optional
		per_task_metrics: bool, optional (default False)
		If true, return computed metric for each task on multitask dataset.
		use_sample_weights: bool, optional (default False)
		If set, use per-sample weights `w`.

deepchem/models/sklearn_models/init.py

+2 −119

Original line number	Diff line number	Diff line
		"""
		Code for processing datasets using scikit-learn.
		"""
		import numpy as np
		import logging
		from sklearn.cross_decomposition import PLSRegression
		from sklearn.ensemble import RandomForestClassifier
		from sklearn.ensemble import RandomForestRegressor
		from sklearn.gaussian_process import GaussianProcessRegressor
		from sklearn.linear_model import LogisticRegression, BayesianRidge
		from sklearn.linear_model import LinearRegression
		from sklearn.linear_model import RidgeCV
		from sklearn.linear_model import LassoCV
		from sklearn.linear_model import ElasticNetCV
		from sklearn.linear_model import LassoLarsCV
		from deepchem.models import Model
		from deepchem.utils.save import load_from_disk
		from deepchem.utils.save import save_to_disk

		# scikit-learn estimator classes that `SklearnModel` fits without sample
		# weights: wrapping an instance of any of these sets `use_weights = False`.
		NON_WEIGHTED_MODELS = [
		LogisticRegression, PLSRegression, GaussianProcessRegressor, ElasticNetCV,
		LassoCV, BayesianRidge
		]

		# Module-level logger named after this module.
		logger = logging.getLogger(__name__)


		class SklearnModel(Model):
		    """Wrapper class that wraps scikit-learn models as DeepChem models.

		    When you're working with scikit-learn and DeepChem, at times it can
		    be useful to wrap a scikit-learn model as a DeepChem model. The
		    reason for this might be that you want to do an apples-to-apples
		    comparison of a scikit-learn model to another DeepChem model, or
		    perhaps you want to use the hyperparameter tuning capabilities in
		    `dc.hyper`. The `SklearnModel` class provides a wrapper around scikit-learn
		    models that allows scikit-learn models to be trained on `Dataset` objects
		    and evaluated with the same metrics as other DeepChem models.

		    Notes
		    -----
		    All `SklearnModels` perform learning solely in memory. This means that it
		    may not be possible to train `SklearnModel` on large `Dataset`s.
		    """

		    def __init__(self, model_instance=None, model_dir=None, **kwargs):
		        """
		        Parameters
		        ----------
		        model_instance: `sklearn.base.BaseEstimator`
		            Must be a scikit-learn `BaseEstimator Class`.
		        model_dir: str, optional (default None)
		            If specified the model will be stored in this directory. Else, a
		            temporary directory will be used.
		        kwargs: dict
		            kwargs['use_weights'] is a bool which determines if we pass weights into
		            self.model_instance.fit(). Defaults to True. It is reset to False
		            whenever `model_instance` is an instance of one of
		            `NON_WEIGHTED_MODELS`. All kwargs are also forwarded to the parent
		            constructor.
		        """
		        super(SklearnModel, self).__init__(model_instance, model_dir, **kwargs)
		        self.use_weights = kwargs.get('use_weights', True)
		        # A single isinstance call against the whole list avoids a loop whose
		        # variable would shadow the `model_instance` parameter.
		        if isinstance(self.model_instance, tuple(NON_WEIGHTED_MODELS)):
		            self.use_weights = False

		    def fit(self, dataset, **kwargs):
		        """Fits SKLearn model to data.

		        Parameters
		        ----------
		        dataset: `Dataset`
		            The `Dataset` to train this model on.
		        kwargs: dict
		            Accepted for interface compatibility; not used by this method.
		        """
		        X = dataset.X
		        y = np.squeeze(dataset.y)
		        # Some scikit-learn models don't use weights.
		        if self.use_weights:
		            w = np.squeeze(dataset.w)
		            # Pass weights by keyword: the third positional parameter of `fit`
		            # is not `sample_weight` for every estimator.
		            self.model_instance.fit(X, y, sample_weight=w)
		        else:
		            self.model_instance.fit(X, y)

		    def predict_on_batch(self, X, pad_batch=False):
		        """
		        Makes predictions on batch of data.

		        Parameters
		        ----------
		        X: np.ndarray
		            Features
		        pad_batch: bool, optional
		            Ignored for Sklearn Model. Only used for Tensorflow models
		            with rigid batch-size requirements.

		        Returns
		        -------
		        The result of the wrapped model's `predict_proba(X)`; if that raises
		        `AttributeError`, the result of `predict(X)` instead.
		        """
		        try:
		            return self.model_instance.predict_proba(X)
		        except AttributeError:
		            return self.model_instance.predict(X)

		    def predict(self, X, transformers=[]):
		        """
		        Makes predictions on dataset.

		        Delegates to the parent class `predict` with the same arguments.

		        Parameters
		        ----------
		        X: `Dataset`
		            Dataset to make prediction on.
		        transformers: list
		            Passed through unchanged to the parent `predict`.
		        """
		        return super(SklearnModel, self).predict(X, transformers)

		    def save(self):
		        """Saves sklearn model to disk using joblib."""
		        save_to_disk(self.model_instance, self.get_model_filename(self.model_dir))

		    def reload(self):
		        """Loads sklearn model from joblib file on disk."""
		        self.model_instance = load_from_disk(
		            Model.get_model_filename(self.model_dir))

		    def get_num_tasks(self):
		        """Number of tasks for this model. Defaults to 1"""
		        return 1
		# flake8: noqa
		from deepchem.models.sklearn_models.sklearn_model import SklearnModel

deepchem/models/sklearn_models/sklearn_model.py

0 → 100644

+132 −0

Original line number	Diff line number	Diff line
		"""
		Code for processing datasets using scikit-learn.
		"""
		import logging
		from typing import List, Optional

		import numpy as np
		from sklearn.base import BaseEstimator
		from sklearn.cross_decomposition import PLSRegression
		from sklearn.gaussian_process import GaussianProcessRegressor
		from sklearn.linear_model import LogisticRegression, BayesianRidge
		from sklearn.linear_model import LassoCV
		from sklearn.linear_model import ElasticNetCV

		from deepchem.models import Model
		from deepchem.data import Dataset
		from deepchem.trans import Transformer
		from deepchem.utils.save import load_from_disk, save_to_disk

		# scikit-learn estimator classes that `SklearnModel` fits without sample
		# weights: wrapping an instance of any of these sets `use_weights = False`.
		NON_WEIGHTED_MODELS = [
		LogisticRegression, PLSRegression, GaussianProcessRegressor, ElasticNetCV,
		LassoCV, BayesianRidge
		]

		# Module-level logger named after this module.
		logger = logging.getLogger(__name__)


		class SklearnModel(Model):
		    """Wrapper class that wraps scikit-learn models as DeepChem models.

		    When you're working with scikit-learn and DeepChem, at times it can
		    be useful to wrap a scikit-learn model as a DeepChem model. The
		    reason for this might be that you want to do an apples-to-apples
		    comparison of a scikit-learn model to another DeepChem model, or
		    perhaps you want to use the hyperparameter tuning capabilities in
		    `dc.hyper`. The `SklearnModel` class provides a wrapper around scikit-learn
		    models that allows scikit-learn models to be trained on `Dataset` objects
		    and evaluated with the same metrics as other DeepChem models.

		    Notes
		    -----
		    All `SklearnModels` perform learning solely in memory. This means that it
		    may not be possible to train `SklearnModel` on large `Dataset`s.
		    """

		    def __init__(self,
		                 model_instance: BaseEstimator,
		                 model_dir: Optional[str] = None,
		                 **kwargs):
		        """
		        Parameters
		        ----------
		        model_instance: BaseEstimator
		            The model instance which inherits a scikit-learn `BaseEstimator` Class.
		        model_dir: str, optional (default None)
		            If specified the model will be stored in this directory. Else, a
		            temporary directory will be used.
		        kwargs: dict
		            kwargs['use_weights'] is a bool which determines if we pass weights into
		            self.model_instance.fit(). Defaults to True. It is reset to False
		            whenever `model_instance` is an instance of one of
		            `NON_WEIGHTED_MODELS`. All kwargs are also forwarded to the parent
		            constructor.
		        """
		        super(SklearnModel, self).__init__(model_instance, model_dir, **kwargs)
		        self.use_weights = kwargs.get('use_weights', True)
		        # A single isinstance call against the whole list avoids a loop whose
		        # variable would shadow the `model_instance` parameter.
		        if isinstance(self.model_instance, tuple(NON_WEIGHTED_MODELS)):
		            self.use_weights = False

		    # FIXME: Return type "None" of "fit" incompatible with return type "float" in supertype "Model"
		    def fit(self, dataset: Dataset, **kwargs) -> None:  # type: ignore[override]
		        """Fits scikit-learn model to data.

		        Parameters
		        ----------
		        dataset: Dataset
		            The `Dataset` to train this model on.
		        kwargs: dict
		            Accepted for interface compatibility; not used by this method.
		        """
		        X = dataset.X
		        y = np.squeeze(dataset.y)
		        # Some scikit-learn models don't use weights.
		        if self.use_weights:
		            w = np.squeeze(dataset.w)
		            # Pass weights by keyword: the third positional parameter of `fit`
		            # is not `sample_weight` for every estimator.
		            self.model_instance.fit(X, y, sample_weight=w)
		        else:
		            self.model_instance.fit(X, y)

		    def predict_on_batch(self, X: np.ndarray) -> np.ndarray:
		        """Makes predictions on batch of data.

		        Parameters
		        ----------
		        X: np.ndarray
		            A numpy array of features.

		        Returns
		        -------
		        np.ndarray
		            The value is a return value of `predict_proba` or `predict` method
		            of the scikit-learn model. If the scikit-learn model has both methods,
		            the value is always a return value of `predict_proba`.
		        """
		        try:
		            # FIXME: BaseEstimator doesn't guarantee the class has `predict_proba` method.
		            return self.model_instance.predict_proba(X)  # type: ignore
		        except AttributeError:
		            # FIXME: BaseEstimator doesn't guarantee the class has `predict` method.
		            return self.model_instance.predict(X)  # type: ignore

		    def predict(self, X: Dataset,
		                transformers: List[Transformer] = []) -> np.ndarray:
		        """Makes predictions on dataset.

		        Delegates to the parent class `predict` with the same arguments.

		        Parameters
		        ----------
		        X: Dataset
		            Dataset to make prediction on.
		        transformers: List[Transformer]
		            Transformers that the input data has been transformed by. The output
		            is passed through these transformers to undo the transformations.

		        Returns
		        -------
		        np.ndarray
		            Whatever the parent class `predict` returns for this dataset.
		        """
		        return super(SklearnModel, self).predict(X, transformers)

		    def save(self):
		        """Saves scikit-learn model to disk using joblib."""
		        save_to_disk(self.model_instance, self.get_model_filename(self.model_dir))

		    def reload(self):
		        """Loads scikit-learn model from joblib file on disk."""
		        self.model_instance = load_from_disk(
		            self.get_model_filename(self.model_dir))

deepchem/models/xgboost_models/init.py

+2 −129

Original line number	Diff line number	Diff line
		"""
		Scikit-learn wrapper interface of xgboost
		"""

		import numpy as np
		import os
		import logging
		from deepchem.models import Model
		from deepchem.models.sklearn_models import SklearnModel
		from deepchem.utils.save import load_from_disk
		from deepchem.utils.save import save_to_disk
		from sklearn.model_selection import train_test_split, GridSearchCV
		import tempfile

		logger = logging.getLogger(__name__)


		class XGBoostModel(SklearnModel):
		    """
		    Abstract base class for XGBoost model.

		    Wraps an estimator exposing the scikit-learn interface of xgboost.
		    `fit` supports `xgboost.XGBClassifier` and `xgboost.XGBRegressor`
		    instances only.
		    """

		    def __init__(self, model_instance=None, model_dir=None, **kwargs):
		        """Abstract class for XGBoost models.

		        Parameters
		        ----------
		        model_instance: object
		            Scikit-learn wrapper interface of xgboost
		        model_dir: str
		            Path to directory where model will be stored. The directory is
		            created if it does not exist; when None, a new temporary directory
		            is created and used.
		        kwargs: dict
		            kwargs['early_stopping_rounds'] is passed to the estimator's `fit`
		            during the early-stopping run in `fit` (default 50).
		        """
		        # NOTE(review): the parent constructors are not called here, so any
		        # attributes they would set are absent on this object -- confirm this
		        # is intended.
		        if model_dir is not None:
		            if not os.path.exists(model_dir):
		                os.makedirs(model_dir)
		        else:
		            model_dir = tempfile.mkdtemp()
		        self.model_dir = model_dir
		        self.model_instance = model_instance
		        # Kept so that `fit` can build a fresh estimator of the same class.
		        self.model_class = model_instance.__class__
		        self.early_stopping_rounds = kwargs.get('early_stopping_rounds', 50)

		    def fit(self, dataset, **kwargs):
		        """
		        Fits XGBoost model to data.

		        Runs a small grid search (`_search_param`), rebuilds the estimator
		        with the best parameters, fits with early stopping on an 80/20 split
		        to estimate the number of trees, then refits on the whole data.

		        Parameters
		        ----------
		        dataset: `Dataset`
		            Training data; `dataset.X` and `dataset.y` are read.
		        kwargs: dict
		            Accepted for interface compatibility; not used by this method.

		        Raises
		        ------
		        ValueError
		            If `model_instance` is neither an `xgboost.XGBClassifier` nor an
		            `xgboost.XGBRegressor`.
		        """
		        X = dataset.X
		        y = np.squeeze(dataset.y)
		        seed = self.model_instance.random_state
		        import xgboost as xgb
		        if isinstance(self.model_instance, xgb.XGBClassifier):
		            xgb_metric = "auc"
		            sklearn_metric = "roc_auc"
		            stratify = y
		        elif isinstance(self.model_instance, xgb.XGBRegressor):
		            xgb_metric = "mae"
		            sklearn_metric = "neg_mean_absolute_error"
		            stratify = None
		        else:
		            # Without this branch the metric names below would be unbound and
		            # the failure would surface as an opaque UnboundLocalError.
		            raise ValueError(
		                "XGBoostModel.fit requires model_instance to be an "
		                "xgboost.XGBClassifier or xgboost.XGBRegressor, got %s." %
		                type(self.model_instance).__name__)
		        best_param = self._search_param(sklearn_metric, X, y)
		        # update model with best param
		        self.model_instance = self.model_class(**best_param)

		        # Find optimal n_estimators based on original learning_rate
		        # and early_stopping_rounds
		        X_train, X_test, y_train, y_test = train_test_split(
		            X, y, test_size=0.2, random_state=seed, stratify=stratify)

		        self.model_instance.fit(
		            X_train,
		            y_train,
		            early_stopping_rounds=self.early_stopping_rounds,
		            eval_metric=xgb_metric,
		            eval_set=[(X_train, y_train), (X_test, y_test)])

		        # Since test size is 20%, when retrain model to whole data, expect
		        # n_estimator increased to 1/0.8 = 1.25 time.
		        estimated_best_round = np.round(self.model_instance.best_ntree_limit * 1.25)
		        self.model_instance.n_estimators = np.int64(estimated_best_round)
		        self.model_instance.fit(X, y, eval_metric=xgb_metric)

		    def _search_param(self, metric, X, y):
		        """
		        Find best potential parameters set using few n_estimators

		        Parameters
		        ----------
		        metric: str
		            Passed to `GridSearchCV` as its `scoring` argument.
		        X, y:
		            Features and labels the grid search is fitted on.

		        Returns
		        -------
		        dict
		            The best parameters found by the grid search, with
		            `learning_rate` and `n_estimators` reset to the values of the
		            current `model_instance`.
		        """

		        # Make sure user specified params are in the grid.

		        def unique_not_none(values):
		            return list(np.unique([x for x in values if x is not None]))

		        max_depth_grid = unique_not_none([self.model_instance.max_depth, 5, 7])
		        colsample_bytree_grid = unique_not_none(
		            [self.model_instance.colsample_bytree, 0.66, 0.9])
		        reg_lambda_grid = unique_not_none([self.model_instance.reg_lambda, 1, 5])
		        # The search uses a learning rate of at least 0.3 and at most 60
		        # estimators, whatever the user configured.
		        learning_rate = 0.3
		        if self.model_instance.learning_rate is not None:
		            learning_rate = max(learning_rate, self.model_instance.learning_rate)
		        n_estimators = 60
		        if self.model_instance.n_estimators is not None:
		            n_estimators = min(n_estimators, self.model_instance.n_estimators)
		        param_grid = {
		            'max_depth': max_depth_grid,
		            'learning_rate': [learning_rate],
		            'n_estimators': [n_estimators],
		            'gamma': [self.model_instance.gamma],
		            'min_child_weight': [self.model_instance.min_child_weight],
		            'max_delta_step': [self.model_instance.max_delta_step],
		            'subsample': [self.model_instance.subsample],
		            'colsample_bytree': colsample_bytree_grid,
		            'colsample_bylevel': [self.model_instance.colsample_bylevel],
		            'reg_alpha': [self.model_instance.reg_alpha],
		            'reg_lambda': reg_lambda_grid,
		            'scale_pos_weight': [self.model_instance.scale_pos_weight],
		            'base_score': [self.model_instance.base_score],
		            'seed': [self.model_instance.random_state]
		        }
		        grid_search = GridSearchCV(
		            self.model_instance, param_grid, cv=2, refit=False, scoring=metric)
		        grid_search.fit(X, y)
		        best_params = grid_search.best_params_
		        # Change params back original params
		        best_params['learning_rate'] = self.model_instance.learning_rate
		        best_params['n_estimators'] = self.model_instance.n_estimators
		        return best_params
		# flake8: noqa
		from deepchem.models.xgboost_models.xgboost_model import XGBoostModel

Admin message