Commit f664d079 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Fixed some bugs in sklearn model usage.

parent 2860de81
Loading
Loading
Loading
Loading
+18 −13
Original line number Diff line number Diff line
"""
Code for processing datasets using scikit-learn.
"""
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
@@ -10,37 +11,41 @@ from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoLarsCV
from deepchem.models import Model
from deepchem.utils.dataset import load_from_disk
from deepchem.utils.dataset import save_to_disk

class SklearnModel(Model):
  """
  Abstract base class for different ML models.
  """
  def __init__(self, model_type, task_types, model_params,
               initialize_raw_model=True):
    """Initialize a scikit-learn model wrapper.

    Parameters
    ----------
    model_type: str
      One of "rf_regressor", "rf_classifier", "logistic", "linear",
      "ridge", "lasso", "lasso_lars", "elastic_net". Selects which
      sklearn estimator backs this model.
    task_types: dict
      Maps task names to task types (stored; interpretation is handled
      by the Model base class).
    model_params: dict
      Hyperparameters for the model (stored; presumably consumed by
      subclasses/fit — TODO confirm against Model base class).
    initialize_raw_model: bool, optional
      If True, construct the underlying sklearn estimator immediately.
      If False, self.raw_model is set to None and must be populated
      later (e.g. via load()).

    Raises
    ------
    ValueError
      If initialize_raw_model is True and model_type is not recognized.
    """
    super(SklearnModel, self).__init__(
        model_type, task_types, model_params, initialize_raw_model)
    self.task_types = task_types
    self.model_params = model_params
    # Default to None so self.raw_model is always bound, even when
    # initialize_raw_model is False (previously this raised NameError).
    raw_model = None
    if initialize_raw_model:
      if self.model_type == "rf_regressor":
        raw_model = RandomForestRegressor(
            n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
      elif self.model_type == "rf_classifier":
        raw_model = RandomForestClassifier(
            n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
      elif self.model_type == "logistic":
        raw_model = LogisticRegression(class_weight="auto")
      elif self.model_type == "linear":
        raw_model = LinearRegression(normalize=True)
      elif self.model_type == "ridge":
        raw_model = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0], normalize=True)
      elif self.model_type == "lasso":
        raw_model = LassoCV(max_iter=2000, n_jobs=-1)
      elif self.model_type == "lasso_lars":
        raw_model = LassoLarsCV(max_iter=2000, n_jobs=-1)
      elif self.model_type == "elastic_net":
        raw_model = ElasticNetCV(max_iter=2000, n_jobs=-1)
      else:
        raise ValueError("Invalid model type provided.")
    self.raw_model = raw_model

  # TODO(rbharath): This does not work with very large datasets! sklearn does
  # support partial_fit, but only for some models. Might make sense to make
@@ -66,11 +71,11 @@ class SklearnModel(Model):
  def save(self, out_dir):
    """Saves sklearn model to disk.

    Delegates metadata persistence to the Model base class, then writes
    the raw sklearn estimator via the project's save_to_disk helper
    (replaces the earlier direct joblib.dump call — joblib is not
    imported in this module).

    Parameters
    ----------
    out_dir: str
      Directory to write the serialized model into.
    """
    super(SklearnModel, self).save(out_dir)
    save_to_disk(self.raw_model, self.get_model_filename(out_dir))

  def load(self, model_dir):
    """Loads sklearn model from file on disk.

    Uses the project's load_from_disk helper (replaces the earlier
    direct joblib.load call — joblib is not imported in this module)
    and stores the result on self.raw_model.

    Parameters
    ----------
    model_dir: str
      Directory containing the serialized model.
    """
    self.raw_model = load_from_disk(Model.get_model_filename(model_dir))

# Register SklearnModel as the handler for these model-type strings so the
# Model factory can dispatch to it.
# NOTE(review): only "logistic" and "rf_classifier" are registered here,
# while __init__ supports several more types ("rf_regressor", "linear",
# "ridge", ...) — confirm whether the others should be registered too.
Model.register_model_type("logistic", SklearnModel)
Model.register_model_type("rf_classifier", SklearnModel)
+1 −1
Original line number Diff line number Diff line
@@ -282,7 +282,7 @@ def train_test_split(paths, input_transforms, output_transforms,

  print("Loading featurized data.")
  samples_dir = os.path.join(data_dir, "samples")
  samples = FeaturizedSamples(samples_dir, dataset_files)
  samples = FeaturizedSamples(samples_dir, dataset_files, reload=False)
  
  print("Split data into train/test")
  train_samples_dir = os.path.join(data_dir, "train-samples")
+3 −0
Original line number Diff line number Diff line
@@ -37,6 +37,9 @@ class Dataset(object):
      write_dataset_single_partial = partial(
          write_dataset_single, data_dir=self.data_dir,
          feature_types=feature_types)
      print("Dataset()")
      print("samples.compounds_df")
      print(samples.compounds_df)

      metadata_rows = []
      # TODO(rbharath): Still a bit of information leakage.
+13 −3
Original line number Diff line number Diff line
@@ -68,7 +68,13 @@ class DataFeaturizer(object):
    df = self._standardize_df(pd.DataFrame(rows))
    for feature_type in feature_types:
      self._featurize_df(df, feature_type)
    print("featurize()")
    print("len(df)")
    print(len(df))
    print("out")
    print(out)
    save_to_disk(df, out)
    df_loaded = load_from_disk(out)

  def _get_fields(self, input_file):
    """Get the names of fields and field_types for input data."""
@@ -226,7 +232,7 @@ class FeaturizedSamples(object):
                  set(FeaturizedSamples.feature_types))
    return sorted(list(task_names))

  def __init__(self, feature_dir, dataset_files=None, overwrite=True):
  def __init__(self, feature_dir, dataset_files=None, overwrite=True, reload=False):
    """
    Initialize FeaturizedSamples

@@ -241,14 +247,18 @@ class FeaturizedSamples(object):
    if not os.path.exists(feature_dir):
      os.makedirs(feature_dir)
    self.feature_dir = feature_dir
    
    if os.path.exists(self._get_compounds_filename()):
    print("FeaturizedSamples()")
    if os.path.exists(self._get_compounds_filename()) and reload:
      print("compounds loaded from disk")
      compounds_df = load_from_disk(self._get_compounds_filename())
    else:
      print("compounds recomputed")
      compounds_df = self._get_compounds()
      # compounds_df is not altered by any method after initialization, so it's
      # safe to keep a copy in memory and on disk.
      save_to_disk(compounds_df, self._get_compounds_filename())
    print("len(compounds_df)")
    print(len(compounds_df))
    self._check_validity(compounds_df)
    self.compounds_df = compounds_df