Commit fa5e9cce authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Save/load integrated into class.

parent 786a2ca9
Loading
Loading
Loading
Loading
+27 −90
Original line number Diff line number Diff line
@@ -9,48 +9,12 @@ import numpy as np
import pandas as pd
import joblib
import os
from keras.models import model_from_json
from deepchem.utils.dataset import NumpyDataset
from deepchem.utils.dataset import load_sharded_dataset
from deepchem.utils.dataset import save_sharded_dataset

'''
def get_parameter_filename(model_dir):
  """
  Given model directory, obtain filename for stored parameters.
  """
  filename = os.path.join(model_dir, "model_params.joblib")
  return filename
'''

# TODO(rbharath): Make these instance methods...
def save_sklearn_model(model, filename):
  """Saves sklearn model to disk using joblib.

  model: a fitted sklearn estimator (anything joblib can pickle).
  filename: destination path; written as a single joblib file.
  """
  joblib.dump(model, filename)

def save_keras_model(model, filename):
  """Saves keras models to disk.

  Keras requires the model architecture and weights to be stored separately:
  the architecture is written as JSON to filename.json and the weights to
  filename.h5 (the extension of `filename` is replaced).

  model: object exposing to_json() and save_weights() (a keras model).
  filename: path whose extension determines the .json/.h5 base name.
  """
  base, _ = os.path.splitext(filename)
  json_filename = "%s.%s" % (base, "json")
  h5_filename = "%s.%s" % (base, "h5")
  # Save architecture. to_json() returns a text string, so the file must be
  # opened in text mode -- the original "wb" raises TypeError on Python 3.
  json_string = model.to_json()
  with open(json_filename, "w") as file_obj:
    file_obj.write(json_string)
  model.save_weights(h5_filename, overwrite=True)


def get_model_filename(model_dir):
  """
  Given model directory, obtain filename for the model itself.

  NOTE(review): despite the name, this returns the model_params.joblib path
  -- confirm whether model and params were intended to share one file.
  """
  return os.path.join(model_dir, "model_params.joblib")

# TODO(rbharath): Make a static method
def get_model_type(model_name):
@@ -66,27 +30,6 @@ def get_model_type(model_name):
    model_type = "sklearn"
  return model_type

# TODO(rbharath): Make this an instance method of Model objects.
# TODO(rbharath): Make this an instance method of Model objects.
def load_sklearn_model(filename):
  """Loads sklearn model from file on disk.

  filename: path to a joblib file previously written by save_sklearn_model.
  Returns the unpickled estimator.
  """
  return joblib.load(filename)

def load_keras_model(filename):
  """Loads keras model from disk.

  Assumes that filename.json and filename.h5 respectively contain the model
  architecture and weights.

  filename: path whose extension is ignored; .json/.h5 siblings are read.
  Returns the reconstructed keras model with weights loaded.
  """
  base, _ = os.path.splitext(filename)
  json_filename = base + ".json"
  h5_filename = base + ".h5"

  # Rebuild the architecture from JSON, then restore the weights into it.
  with open(json_filename) as arch_file:
    model = model_from_json(arch_file.read())
  model.load_weights(h5_filename)
  return model

#TODO(enf/rbharath): incorporate save, load, eval, fit features into class Model.
class Model(object):
  """
@@ -94,7 +37,9 @@ class Model(object):
  """
  # List of registered models
  registered_model_types = {}
  def __init__(self, task_types, model_params, initialize_raw_model=True):
  def __init__(self, model_type, task_types, model_params,
               initialize_raw_model=True):
    self.model_type = model_type
    self.task_types = task_types
    self.model_params = model_params

@@ -124,6 +69,18 @@ class Model(object):
    """
    return(self.raw_model)

  def get_param_filename(self, out_dir):
    """
    Given model directory, obtain filename for the stored model parameters.

    NOTE(review): save() calls self.get_params_filename (plural "params"),
    which does not match this method name -- as written that call site would
    raise AttributeError; confirm the intended spelling.
    """
    return os.path.join(out_dir, "model_params.joblib")

  def get_model_filename(self, out_dir):
    """
    Given model directory, obtain filename for the serialized raw model
    itself (distinct from the model_params.joblib file used for parameters).
    """
    return os.path.join(out_dir, "model.joblib")

  @staticmethod
  def model_builder(model_type, task_types, model_params,
                    initialize_raw_model=True):
@@ -132,7 +89,7 @@ class Model(object):
    """
    if model_type in Model.registered_model_types:
      model = Model.registered_model_types[model_type](
          task_types, model_params, initialize_raw_model)
          model_type, task_types, model_params, initialize_raw_model)
    else:
      raise ValueError("model_type %s is not supported" % model_type)
    return model
@@ -144,40 +101,20 @@ class Model(object):
    """
    Model.registered_model_types[model_type] = model_class

  def load(self, model_dir):
    """Dispatcher function for loading.

    Restores model_params, task_types and model_type from the params file
    written by save(). Subclasses override this to additionally restore
    their raw_model.
    """
    # save() writes the params dict to the param filename, so read it back
    # from the same path; the original read get_model_filename, which is
    # where subclasses store the raw model, not the params.
    params = load_sharded_dataset(self.get_param_filename(model_dir))
    self.model_params = params["model_params"]
    self.task_types = params["task_types"]
    self.model_type = params["model_type"]

  def save(self, out_dir):
    """Dispatcher function for saving.

    Persists model_params, task_types and model_type to the params file.
    Subclasses override this to additionally persist their raw_model.
    """
    params = {"model_params": self.model_params,
              "task_types": self.task_types,
              "model_type": self.model_type}
    # Fix: the original called self.get_params_filename (plural), but the
    # defined method is get_param_filename -- the call raised AttributeError.
    save_sharded_dataset(params, self.get_param_filename(out_dir))

  def fit(self, numpy_dataset):
    """
+39 −1
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@ Code for processing the Google vs-datasets using keras.
"""
import os

import numpy as np
from keras.models import Graph
from keras.models import model_from_json
from keras.layers.core import Dense, Dropout
from keras.optimizers import SGD

from deepchem.models import Model
@@ -12,7 +13,7 @@ class MultiTaskDNN(Model):
  Abstract base class for different ML models.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    super(MultiTaskDNN, self).__init__(task_types, model_params,
    super(MultiTaskDNN, self).__init__(model_type, task_types, model_params,
                                       initialize_raw_model)
    if initialize_raw_model:
      sorted_tasks = sorted(task_types.keys())
@@ -116,6 +117,43 @@ class MultiTaskDNN(Model):
    y_pred = np.squeeze(y_pred)
    return y_pred

  def save(self, out_dir):
    """
    Saves underlying keras model to disk.

    Keras requires the model architecture and weights to be stored
    separately: architecture goes to a .json file and weights to an .h5
    file, both derived from get_model_filename with the extension swapped.
    Requires `import os` at the top of this file.
    """
    super(MultiTaskDNN, self).save(out_dir)
    model = self.get_raw_model()
    filename, _ = os.path.splitext(self.get_model_filename(out_dir))

    json_filename = "%s.%s" % (filename, "json")
    h5_filename = "%s.%s" % (filename, "h5")
    # Save architecture. to_json() returns a text string, so open in text
    # mode -- the original "wb" raises TypeError on Python 3.
    json_string = model.to_json()
    with open(json_filename, "w") as file_obj:
      file_obj.write(json_string)
    model.save_weights(h5_filename, overwrite=True)

  def load(self, model_dir):
    """
    Load keras multitask DNN from disk.

    Reads the architecture from the .json sibling of get_model_filename and
    the weights from the .h5 sibling, then stores the result on raw_model.
    """
    super(MultiTaskDNN, self).load(model_dir)
    # Fix: the original called self.get_Model_filename (capital M), which
    # does not exist and raised AttributeError.
    filename, _ = os.path.splitext(self.get_model_filename(model_dir))

    json_filename = "%s.%s" % (filename, "json")
    h5_filename = "%s.%s" % (filename, "h5")

    with open(json_filename) as file_obj:
      model = model_from_json(file_obj.read())
    model.load_weights(h5_filename)
    self.raw_model = model


# Register MultiTaskDNN under both DNN model-type strings so that
# Model.model_builder can construct it for either task flavor.
Model.register_model_type("multitask_deep_regressor", MultiTaskDNN)
Model.register_model_type("multitask_deep_classifier", MultiTaskDNN)

+54 −7
Original line number Diff line number Diff line
@@ -16,24 +16,70 @@ class SklearnModel(Model):
  Abstract base class for different ML models.
  """
  def __init__(self, model_type, task_types, model_params,
               initialize_raw_model=True):
    """
    Constructs a wrapper around the sklearn estimator named by model_type.

    model_type: one of the registered sklearn model strings below
      ("rf_regressor", "logistic", ...).
    Fixes vs. original: takes model_type explicitly (matching the new
    Model.__init__ signature and the Model.model_builder call), replaces the
    undefined self.modeltype / modeltype names with it, and actually stores
    the constructed estimator on self.raw_model (the original discarded it).
    """
    super(SklearnModel, self).__init__(model_type, task_types, model_params,
                                       initialize_raw_model)
    self.raw_model = None
    if initialize_raw_model:
      if model_type == "rf_regressor":
        raw_model = RandomForestRegressor(
            n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
      elif model_type == "rf_classifier":
        raw_model = RandomForestClassifier(
            n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
      elif model_type == "logistic":
        raw_model = LogisticRegression(class_weight="auto")
      elif model_type == "linear":
        raw_model = LinearRegression(normalize=True)
      elif model_type == "ridge":
        raw_model = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0], normalize=True)
      elif model_type == "lasso":
        raw_model = LassoCV(max_iter=2000, n_jobs=-1)
      elif model_type == "lasso_lars":
        raw_model = LassoLarsCV(max_iter=2000, n_jobs=-1)
      elif model_type == "elastic_net":
        raw_model = ElasticNetCV(max_iter=2000, n_jobs=-1)
      else:
        raise ValueError("Invalid model type provided.")
      self.raw_model = raw_model

  # TODO(rbharath): This is a partial implementation! Does not work for
  # datasets with more than one shard.
  def fit(self, numpy_dataset):
    """
    Fits SKLearn model to data.

    Only the first shard of numpy_dataset is used, since the wrapped
    sklearn estimators are fit in a single call.
    """
    for (X, y, _, _) in numpy_dataset.itershards():
      self.raw_model.fit(X, y)
      # Deliberately stop after the first shard (see TODO above);
      # incremental multi-shard fitting is not supported here.
      return

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.

    X: feature matrix accepted by the underlying sklearn estimator.
    Returns the estimator's predictions for X.
    """
    return self.raw_model.predict(X)

  def save(self, out_dir):
    """Saves sklearn model to disk using joblib."""
    # Persist params via the base class, then pickle the estimator itself.
    super(SklearnModel, self).save(out_dir)
    model_path = self.get_model_filename(out_dir)
    joblib.dump(self.raw_model, model_path)

  def load(self, model_dir):
    """Loads sklearn model from joblib file on disk.

    Restores params via the base class, then unpickles the estimator into
    raw_model.
    """
    super(SklearnModel, self).load(model_dir)
    # Fix: the original line was missing its closing parenthesis
    # (SyntaxError).
    self.raw_model = joblib.load(self.get_model_filename(model_dir))

# Register every supported sklearn estimator string against SklearnModel so
# that Model.model_builder can construct the right wrapper by name.
Model.register_model_type("logistic", SklearnModel)
Model.register_model_type("rf_classifier", SklearnModel)
Model.register_model_type("rf_regressor", SklearnModel)
Model.register_model_type("linear", SklearnModel)
Model.register_model_type("ridge", SklearnModel)
Model.register_model_type("lasso", SklearnModel)
Model.register_model_type("lasso_lars", SklearnModel)
Model.register_model_type("elastic_net", SklearnModel)


# TODO(rbharath): Need to fix singletask dataset support.
'''
def fit_singletask_models(train_data, modeltype):
  """Fits singletask linear regression models to potency.

@@ -84,3 +130,4 @@ def fit_singletask_models(train_data, modeltype):
    model.fit(task_X_train, task_y_train.ravel())
    models[task] = model
  return models
'''