Unverified Commit bc05e7c1 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #2110 from nd-02110114/model-update

Update sklearn and xgboost model type annotation
parents de78012e 78c35b93
Loading
Loading
Loading
Loading
+10 −4
Original line number Diff line number Diff line
@@ -2,11 +2,8 @@
Gathers all models in one place for convenient imports
"""
# flake8: noqa

from deepchem.models.models import Model
from deepchem.models.keras_model import KerasModel
from deepchem.models.sklearn_models import SklearnModel
from deepchem.models.xgboost_models import XGBoostModel
from deepchem.models.multitask import SingletaskToMultitask
from deepchem.models.callbacks import ValidationCallback

@@ -27,6 +24,15 @@ from deepchem.models.text_cnn import TextCNNModel
from deepchem.models.atomic_conv import AtomicConvModel
from deepchem.models.chemnet_models import Smiles2Vec, ChemCeption

# scikit-learn model
from deepchem.models.sklearn_models import SklearnModel

# XGBoost model
try:
  from deepchem.models.xgboost_models import XGBoostModel
except ModuleNotFoundError:
  pass

# PyTorch models
try:
  from deepchem.models.torch_models import TorchModel
+23 −38
Original line number Diff line number Diff line
@@ -2,28 +2,20 @@
Contains an abstract base class that supports different ML models.
"""

import sys
import numpy as np
import pandas as pd
import joblib
import os
import shutil
import tempfile
import sklearn
import logging
from typing import Any, List, Optional, Sequence

import numpy as np
from sklearn.base import BaseEstimator

import logging
from deepchem.data import Dataset, pad_features
from deepchem.data import Dataset
from deepchem.metrics import Metric
from deepchem.trans import Transformer, undo_transforms
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import save_to_disk
from deepchem.utils.evaluate import Evaluator

from typing import Any, Dict, List, Optional, Sequence
from deepchem.utils.typing import OneOrMany

logger = logging.getLogger(__name__)


@@ -33,7 +25,7 @@ class Model(BaseEstimator):
  """

  def __init__(self,
               model_instance: Optional[Any] = None,
               model_instance=None,
               model_dir: Optional[str] = None,
               **kwargs) -> None:
    """Abstract class for all models.
@@ -51,8 +43,8 @@ class Model(BaseEstimator):
    """
    if self.__class__.__name__ == "Model":
      raise ValueError(
          "This constructor is for an abstract class and should never be called directly. Can only call from subclass constructors."
      )
          "This constructor is for an abstract class and should never be called directly."
          "Can only call from subclass constructors.")
    self.model_dir_is_temp = False
    if model_dir is not None:
      if not os.path.exists(model_dir):
@@ -68,21 +60,17 @@ class Model(BaseEstimator):
    if 'model_dir_is_temp' in dir(self) and self.model_dir_is_temp:
      shutil.rmtree(self.model_dir)

  def fit_on_batch(self, X: Sequence, y: Sequence, w: Sequence) -> float:
  def fit_on_batch(self, X: Sequence, y: Sequence, w: Sequence):
    """Perform a single step of training.

    Parameters
    ----------
    X: ndarray
    X: np.ndarray
      the inputs for the batch
    y: ndarray
    y: np.ndarray
      the labels for the batch
    w: ndarray
    w: np.ndarray
      the weights for the batch

    Returns
    -------
    the loss on the batch
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")
@@ -140,6 +128,7 @@ class Model(BaseEstimator):

    Returns
    -------
    float
      The average loss over the most recent checkpoint interval.
    """
    for epoch in range(nb_epoch):
@@ -152,28 +141,24 @@ class Model(BaseEstimator):
    return np.array(losses).mean()

  def predict(self, dataset: Dataset,
              transformers: List[Transformer] = []) -> OneOrMany[np.ndarray]:
              transformers: List[Transformer] = []) -> np.ndarray:
    """
    Uses self to make predictions on provided Dataset object.


    Parameters
    ----------
    dataset: dc.data.Dataset
    dataset: Dataset
      Dataset to make prediction on
    transformers: list of dc.trans.Transformers
    transformers: List[Transformer]
      Transformers that the input data has been transformed by. The output
      is passed through these transformers to undo the transformations.

    Returns
    -------
    a NumPy array of the model produces a single output, or a list of arrays
    if it produces multiple outputs
    np.ndarray
      A numpy array of predictions the model produces.
    """
    y_preds = []
    n_tasks = self.get_num_tasks()
    ind = 0

    for (X_batch, _, _, ids_batch) in dataset.iterbatches(deterministic=True):
      n_samples = len(X_batch)
      y_pred_batch = self.predict_on_batch(X_batch)
@@ -205,9 +190,9 @@ class Model(BaseEstimator):

    Parameters
    ----------
    dataset: `dc.data.Dataset`
    dataset: Dataset
      Dataset object.
    metrics: dc.metrics.Metric/list[dc.metrics.Metric]/function
    metrics: Metric / List[Metric] / function
      The set of metrics provided. This class attempts to do some
      intelligent handling of input. If a single `dc.metrics.Metric`
      object is provided or a list is provided, it will evaluate
@@ -218,11 +203,11 @@ class Model(BaseEstimator):
      `np.ndarray` objects and return a floating point score. The
      metric function may also accept a keyword argument
      `sample_weight` to account for per-sample weights.
    transformers: list
    transformers: List[Transformer]
      List of `dc.trans.Transformer` objects. These transformations
      must have been applied to `dataset` previously. The dataset will
      be untransformed for metric evaluation.
    per_task_metrics: bool, optional
    per_task_metrics: bool, optional (default False)
      If true, return computed metric for each task on multitask dataset.
    use_sample_weights: bool, optional (default False)
      If set, use per-sample weights `w`.
+2 −119
Original line number Diff line number Diff line
"""
Code for processing datasets using scikit-learn.
"""
import numpy as np
import logging
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import LogisticRegression, BayesianRidge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoLarsCV
from deepchem.models import Model
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import save_to_disk

NON_WEIGHTED_MODELS = [
    LogisticRegression, PLSRegression, GaussianProcessRegressor, ElasticNetCV,
    LassoCV, BayesianRidge
]

logger = logging.getLogger(__name__)


class SklearnModel(Model):
  """Adapter that exposes a scikit-learn estimator through the DeepChem API.

  Wrapping a scikit-learn estimator in this class lets it be trained on
  `Dataset` objects and evaluated with the same metrics as any other
  DeepChem model. That is useful for apples-to-apples comparisons against
  other DeepChem models, or for driving the estimator with the
  hyperparameter tuning capabilities in `dc.hyper`.

  Note
  ----
  All `SklearnModels` perform learning solely in memory. This means that it
  may not be possible to train `SklearnModel` on large `Dataset`s.
  """

  def __init__(self, model_instance=None, model_dir=None, **kwargs):
    """Wrap `model_instance` and decide whether `fit` passes weights.

    Parameters
    ----------
    model_instance: `sklearn.base.BaseEstimator`
      Must be a scikit-learn `BaseEstimator Class`.
    model_dir: str, optional (default None)
      If specified the model will be stored in this directory. Else, a
      temporary directory will be used.
    kwargs: dict
      kwargs['use_weights'] is a bool (treated as True when absent) which
      determines if we pass weights into self.model_instance.fit(). It is
      forced to False when the estimator is an instance of one of the
      classes listed in `NON_WEIGHTED_MODELS`.
    """
    super().__init__(model_instance, model_dir, **kwargs)
    self.use_weights = kwargs.get('use_weights', True)
    # Estimators from these families are never handed weights, whatever the
    # caller requested.
    if isinstance(self.model_instance, tuple(NON_WEIGHTED_MODELS)):
      self.use_weights = False

  def fit(self, dataset, **kwargs):
    """Fits SKLearn model to data.

    Parameters
    ----------
    dataset: `Dataset`
      The `Dataset` to train this model on. Its `X`, `y` and `w` attributes
      are read; `y` and `w` are passed through `np.squeeze` first.
    """
    features = dataset.X
    labels = np.squeeze(dataset.y)
    weights = np.squeeze(dataset.w)
    if not self.use_weights:
      # Some scikit-learn models don't use weights.
      self.model_instance.fit(features, labels)
    else:
      self.model_instance.fit(features, labels, weights)

  def predict_on_batch(self, X, pad_batch=False):
    """Makes predictions on batch of data.

    Parameters
    ----------
    X: np.ndarray
      Features
    pad_batch: bool, optional
      Ignored for Sklearn Model. Only used for Tensorflow models
      with rigid batch-size requirements.

    Returns
    -------
    The output of the estimator's `predict_proba`, or of its `predict` when
    the `predict_proba` attempt raises `AttributeError`.
    """
    estimator = self.model_instance
    try:
      predictions = estimator.predict_proba(X)
    except AttributeError:
      predictions = estimator.predict(X)
    return predictions

  def predict(self, X, transformers=[]):
    """Makes predictions on dataset by delegating to the parent `predict`.

    Both arguments are forwarded unchanged.
    """
    return super().predict(X, transformers)

  def save(self):
    """Saves sklearn model to disk using joblib."""
    target_path = self.get_model_filename(self.model_dir)
    save_to_disk(self.model_instance, target_path)

  def reload(self):
    """Loads sklearn model from joblib file on disk."""
    source_path = Model.get_model_filename(self.model_dir)
    self.model_instance = load_from_disk(source_path)

  def get_num_tasks(self):
    """Number of tasks for this model. Defaults to 1"""
    return 1
# flake8: noqa
from deepchem.models.sklearn_models.sklearn_model import SklearnModel
+132 −0
Original line number Diff line number Diff line
"""
Code for processing datasets using scikit-learn.
"""
import logging
from typing import List, Optional

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.cross_decomposition import PLSRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import LogisticRegression, BayesianRidge
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV

from deepchem.models import Model
from deepchem.data import Dataset
from deepchem.trans import Transformer
from deepchem.utils.save import load_from_disk, save_to_disk

NON_WEIGHTED_MODELS = [
    LogisticRegression, PLSRegression, GaussianProcessRegressor, ElasticNetCV,
    LassoCV, BayesianRidge
]

logger = logging.getLogger(__name__)


class SklearnModel(Model):
  """Adapter that exposes a scikit-learn estimator through the DeepChem API.

  Wrapping a scikit-learn estimator in this class lets it be trained on
  `Dataset` objects and evaluated with the same metrics as any other
  DeepChem model. That is useful for apples-to-apples comparisons against
  other DeepChem models, or for driving the estimator with the
  hyperparameter tuning capabilities in `dc.hyper`.

  Notes
  -----
  All `SklearnModels` perform learning solely in memory. This means that it
  may not be possible to train `SklearnModel` on large `Dataset`s.
  """

  def __init__(self,
               model_instance: BaseEstimator,
               model_dir: Optional[str] = None,
               **kwargs):
    """Wrap `model_instance` and decide whether `fit` passes weights.

    Parameters
    ----------
    model_instance: BaseEstimator
      The model instance which inherits a scikit-learn `BaseEstimator` Class.
    model_dir: str, optional (default None)
      If specified the model will be stored in this directory. Else, a
      temporary directory will be used.
    kwargs: dict
      kwargs['use_weights'] is a bool (treated as True when absent) which
      determines if we pass weights into self.model_instance.fit(). It is
      forced to False when the estimator is an instance of one of the
      classes listed in `NON_WEIGHTED_MODELS`.
    """
    super().__init__(model_instance, model_dir, **kwargs)
    self.use_weights = kwargs.get('use_weights', True)
    # Estimators from these families are never handed weights, whatever the
    # caller requested.
    if isinstance(self.model_instance, tuple(NON_WEIGHTED_MODELS)):
      self.use_weights = False

  # FIXME: Return type "None" of "fit" incompatible with return type "float" in supertype "Model"
  def fit(self, dataset: Dataset, **kwargs) -> None:  # type: ignore[override]
    """Fits scikit-learn model to data.

    Parameters
    ----------
    dataset: Dataset
      The `Dataset` to train this model on. Its `X`, `y` and `w` attributes
      are read; `y` and `w` are passed through `np.squeeze` first.
    """
    features = dataset.X
    labels = np.squeeze(dataset.y)
    weights = np.squeeze(dataset.w)
    if not self.use_weights:
      # Some scikit-learn models don't use weights.
      self.model_instance.fit(features, labels)
    else:
      self.model_instance.fit(features, labels, weights)

  def predict_on_batch(self, X: np.ndarray) -> np.ndarray:
    """Makes predictions on batch of data.

    Parameters
    ----------
    X: np.ndarray
      A numpy array of features.

    Returns
    -------
    np.ndarray
      The output of the estimator's `predict_proba`, or of its `predict`
      when the `predict_proba` attempt raises `AttributeError`. An estimator
      offering both therefore always answers with `predict_proba`.
    """
    estimator = self.model_instance
    try:
      # FIXME: BaseEstimator doesn't guarantee the class has `predict_proba` method.
      predictions = estimator.predict_proba(X)  # type: ignore
    except AttributeError:
      # FIXME: BaseEstimator doesn't guarantee the class has `predict` method.
      predictions = estimator.predict(X)  # type: ignore
    return predictions

  def predict(self, X: Dataset,
              transformers: List[Transformer] = []) -> np.ndarray:
    """Makes predictions on dataset by delegating to the parent `predict`.

    Parameters
    ----------
    X: Dataset
      Dataset to make prediction on.
    transformers: List[Transformer]
      Transformers that the input data has been transformed by. The output
      is passed through these transformers to undo the transformations.

    Returns
    -------
    np.ndarray
      Whatever the parent class' `predict` returns for these arguments.
    """
    return super().predict(X, transformers)

  def save(self):
    """Saves scikit-learn model to disk using joblib."""
    target_path = self.get_model_filename(self.model_dir)
    save_to_disk(self.model_instance, target_path)

  def reload(self):
    """Loads scikit-learn model from joblib file on disk."""
    source_path = self.get_model_filename(self.model_dir)
    self.model_instance = load_from_disk(source_path)
+2 −129
Original line number Diff line number Diff line
"""
Scikit-learn wrapper interface of xgboost
"""

import numpy as np
import os
import logging
from deepchem.models import Model
from deepchem.models.sklearn_models import SklearnModel
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import save_to_disk
from sklearn.model_selection import train_test_split, GridSearchCV
import tempfile

logger = logging.getLogger(__name__)


class XGBoostModel(SklearnModel):
  """DeepChem wrapper around the scikit-learn style interface of XGBoost.

  `fit` runs a small grid search over a few hyperparameters, picks the
  number of boosting rounds with early stopping on a held-out split, and
  finally retrains the estimator on the whole dataset.
  """

  def __init__(self, model_instance=None, model_dir=None, **kwargs):
    """Store the estimator and prepare the model directory.

    Parameters
    ----------
    model_instance: object
      Scikit-learn wrapper interface of xgboost
    model_dir: str
      Path to directory where model will be stored. It is created when it
      does not exist yet; when omitted, a fresh temporary directory is made.
    kwargs: dict
      kwargs['early_stopping_rounds'] is forwarded to the estimator's `fit`
      during the early-stopping pass (50 when absent).
    """
    # NOTE(review): the parent constructors are not invoked here, so any
    # attribute they would set (e.g. `use_weights`) is absent on this
    # object -- confirm this is intended.
    if model_dir is not None:
      if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    else:
      model_dir = tempfile.mkdtemp()
    self.model_dir = model_dir
    self.model_instance = model_instance
    # Remembered so `fit` can build a fresh estimator of the same class.
    self.model_class = model_instance.__class__
    self.early_stopping_rounds = kwargs.get('early_stopping_rounds', 50)

  def fit(self, dataset, **kwargs):
    """Fits XGBoost model to data.

    Parameters
    ----------
    dataset: `Dataset`
      The `Dataset` to train this model on. Only `dataset.X` and
      `dataset.y` are read.

    Raises
    ------
    ValueError
      If the wrapped estimator is neither an `xgboost.XGBClassifier` nor an
      `xgboost.XGBRegressor`, since no evaluation metric is defined for it.
    """
    X = dataset.X
    y = np.squeeze(dataset.y)
    seed = self.model_instance.random_state
    import xgboost as xgb
    if isinstance(self.model_instance, xgb.XGBClassifier):
      xgb_metric = "auc"
      sklearn_metric = "roc_auc"
      # Keep the class balance identical in both halves of the split.
      stratify = y
    elif isinstance(self.model_instance, xgb.XGBRegressor):
      xgb_metric = "mae"
      sklearn_metric = "neg_mean_absolute_error"
      stratify = None
    else:
      # Without this branch the metric names below would be unbound and the
      # failure would surface as an opaque UnboundLocalError.
      raise ValueError(
          "XGBoostModel requires an xgboost.XGBClassifier or "
          "xgboost.XGBRegressor instance, got {}.".format(
              type(self.model_instance).__name__))
    best_param = self._search_param(sklearn_metric, X, y)
    # update model with best param
    self.model_instance = self.model_class(**best_param)

    # Find optimal n_estimators based on original learning_rate
    # and early_stopping_rounds
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=stratify)

    self.model_instance.fit(
        X_train,
        y_train,
        early_stopping_rounds=self.early_stopping_rounds,
        eval_metric=xgb_metric,
        eval_set=[(X_train, y_train), (X_test, y_test)])

    # Since test size is 20%, when retrain model to whole data, expect
    # n_estimator increased to 1/0.8 = 1.25 time.
    estimated_best_round = np.round(self.model_instance.best_ntree_limit * 1.25)
    self.model_instance.n_estimators = np.int64(estimated_best_round)
    self.model_instance.fit(X, y, eval_metric=xgb_metric)

  def _search_param(self, metric, X, y):
    """Find best potential parameters set using few n_estimators.

    A 2-fold `GridSearchCV` is run over `max_depth`, `colsample_bytree` and
    `reg_lambda`; every other listed parameter is pinned to the value found
    on the wrapped estimator. The search itself uses a learning rate of at
    least 0.3 and at most 60 estimators.

    Parameters
    ----------
    metric: str
      Value passed as `scoring` to `GridSearchCV`.
    X: np.ndarray
      Features to search on.
    y: np.ndarray
      Labels to search on.

    Returns
    -------
    dict
      The best parameter combination, with `learning_rate` and
      `n_estimators` reset to the wrapped estimator's own values.
    """

    # Make sure user specified params are in the grid.

    def unique_not_none(values):
      return list(np.unique([x for x in values if x is not None]))

    estimator = self.model_instance
    max_depth_grid = unique_not_none([estimator.max_depth, 5, 7])
    colsample_bytree_grid = unique_not_none(
        [estimator.colsample_bytree, 0.66, 0.9])
    reg_lambda_grid = unique_not_none([estimator.reg_lambda, 1, 5])
    learning_rate = 0.3
    if estimator.learning_rate is not None:
      learning_rate = max(learning_rate, estimator.learning_rate)
    n_estimators = 60
    if estimator.n_estimators is not None:
      n_estimators = min(n_estimators, estimator.n_estimators)
    param_grid = {
        'max_depth': max_depth_grid,
        'learning_rate': [learning_rate],
        'n_estimators': [n_estimators],
        'gamma': [estimator.gamma],
        'min_child_weight': [estimator.min_child_weight],
        'max_delta_step': [estimator.max_delta_step],
        'subsample': [estimator.subsample],
        'colsample_bytree': colsample_bytree_grid,
        'colsample_bylevel': [estimator.colsample_bylevel],
        'reg_alpha': [estimator.reg_alpha],
        'reg_lambda': reg_lambda_grid,
        'scale_pos_weight': [estimator.scale_pos_weight],
        'base_score': [estimator.base_score],
        'seed': [estimator.random_state]
    }
    grid_search = GridSearchCV(
        estimator, param_grid, cv=2, refit=False, scoring=metric)
    grid_search.fit(X, y)
    best_params = grid_search.best_params_
    # Change params back original params
    best_params['learning_rate'] = estimator.learning_rate
    best_params['n_estimators'] = estimator.n_estimators
    return best_params
# flake8: noqa
from deepchem.models.xgboost_models.xgboost_model import XGBoostModel
Loading