Commit 403bcfd1 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

More progress in OO. Debugging model stats.

parent 801dd394
Loading
Loading
Loading
Loading
+195 −1
Original line number Diff line number Diff line
"""
Contains an abstract base class that supports different ML models.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals
import sys
import numpy as np
import pandas as pd
import joblib
import os
from keras.models import model_from_json
from deep_chem.utils.dataset import NumpyDataset
from deep_chem.utils.dataset import load_sharded_dataset
from deep_chem.utils.dataset import save_sharded_dataset

'''
def get_parameter_filename(model_dir):
  """
  Given model directory, obtain filename for stored parameters.
  """
  filename = os.path.join(model_dir, "model_params.joblib")
  return filename
'''

# TODO(rbharath): Make these instance methods...
def save_sklearn_model(model, filename):
  """Serialize a scikit-learn model to `filename` using joblib."""
  # joblib serializes the numpy arrays inside sklearn estimators efficiently.
  joblib.dump(model, filename)

def save_keras_model(model, filename):
  """Saves a keras model to disk.

  Keras requires the model architecture and weights to be stored separately:
  the architecture is written as JSON to <base>.json and the weights to
  <base>.h5, where <base> is `filename` with its extension stripped.

  Parameters
  ----------
  model: keras model
    Object exposing to_json() and save_weights() (keras API).
  filename: str
    Base path for the output files; any existing extension is dropped.
  """
  filename, _ = os.path.splitext(filename)

  json_filename = "%s.%s" % (filename, "json")
  h5_filename = "%s.%s" % (filename, "h5")
  # Save architecture. model.to_json() returns a (unicode) string, so the
  # file must be opened in text mode; "wb" raises TypeError on Python 3.
  json_string = model.to_json()
  with open(json_filename, "w") as file_obj:
    file_obj.write(json_string)
  # overwrite=True suppresses keras' interactive overwrite prompt.
  model.save_weights(h5_filename, overwrite=True)


def get_model_filename(model_dir):
  """Return the path of the serialized model parameters inside `model_dir`."""
  return os.path.join(model_dir, "model_params.joblib")

# TODO(rbharath): Make a static method
def get_model_type(model_name):
  """Associate each model with a model_type (used for saving/loading)."""
  graph_models = ("singletask_deep_classifier", "multitask_deep_classifier",
                  "singletask_deep_regressor", "multitask_deep_regressor")
  if model_name in graph_models:
    return "keras-graph"
  if model_name == "convolutional_3D_regressor":
    return "keras-sequential"
  if model_name == "neural_fingerprint":
    return "autograd"
  # Anything unrecognized is assumed to be a scikit-learn estimator.
  return "sklearn"

# TODO(rbharath): Make this an instance method of Model objects.
def load_sklearn_model(filename):
  """Deserialize a scikit-learn model previously stored with joblib."""
  model = joblib.load(filename)
  return model

def load_keras_model(filename):
  """Loads keras model from disk.

  Expects the architecture in <base>.json and the weights in <base>.h5,
  the layout produced by save_keras_model.
  """
  base, _ = os.path.splitext(filename)
  json_filename = base + ".json"
  h5_filename = base + ".h5"

  with open(json_filename) as file_obj:
    architecture = file_obj.read()
  model = model_from_json(architecture)
  model.load_weights(h5_filename)
  return model

#TODO(enf/rbharath): incorporate save, load, eval, fit features into class Model.
class Model(object):
@@ -46,6 +127,9 @@ class Model(object):
  @staticmethod
  def model_builder(model_type, task_types, model_params,
                    initialize_raw_model=True):
    """
    Factory method that initializes model of requested type.
    """
    if model_type in Model.registered_model_types:
      model = Model.registered_model_types[model_type](
          task_types, model_params, initialize_raw_model)
@@ -55,8 +139,118 @@ class Model(object):

  @staticmethod
  def register_model_type(model_type, model_class):
    """
    Registers model types in static variable for factory/dispatchers to use.
    """
    registry = Model.registered_model_types
    registry[model_type] = model_class

  @staticmethod
  def load_model(model_name, model_dir):
    """Dispatcher function for loading."""
    model_type = get_model_type(model_name)
    stored_filename = get_model_filename(model_dir)
    # Metadata (task types + hyperparameters) lives next to the raw model.
    params = load_sharded_dataset(stored_filename)
    model = Model.model_builder(model_name, params["task_types"],
                                params["model_params"],
                                initialize_raw_model=False)
    if model_type == "sklearn":
      raw_model = load_sklearn_model(stored_filename)
    elif "keras" in model_type:
      raw_model = load_keras_model(stored_filename)
    else:
      raise ValueError("Unsupported model_type.")
    model.set_raw_model(raw_model)
    return model

  # TODO(rbharath): This really shouldn't be a static method. Make an instance
  # method instance.
  @staticmethod
  def save_model(model, model_name, model_dir):
    """Dispatcher function for saving."""
    model_type = get_model_type(model_name)
    stored_filename = get_model_filename(model_dir)
    # Persist metadata (task types + hyperparameters) next to the raw model.
    params = {"model_params": model.model_params,
              "task_types": model.task_types}
    save_sharded_dataset(params, stored_filename)

    raw_model = model.get_raw_model()
    if model_type == "sklearn":
      save_sklearn_model(raw_model, stored_filename)
    elif "keras" in model_type:
      save_keras_model(raw_model, stored_filename)
    else:
      raise ValueError("Unsupported model_type.")


  def fit(self, numpy_dataset):
    """
    Fits a model on data in a NumpyDataset object.

    Shards whose in-memory size exceeds the (hard-coded) GPU RAM budget are
    split into roughly equal sub-batches before being fed to fit_on_batch.
    """
    # TODO(rbharath/enf): This GPU_RAM is black magic. Needs to be removed/made
    # more general.
    MAX_GPU_RAM = float(691007488/50)
    for (X, y, w, _) in numpy_dataset.itershards():
      if sys.getsizeof(X) > MAX_GPU_RAM:
        # np.linspace requires an integer sample count; round the block count
        # up so every sub-batch stays within the RAM budget. (Passing a float
        # num to linspace is a TypeError in modern numpy.)
        nb_block = int(np.ceil(float(sys.getsizeof(X))/MAX_GPU_RAM))
        nb_sample = np.shape(X)[0]
        interval_points = np.linspace(0, nb_sample, nb_block+1).astype(int)
        for j in range(len(interval_points)-1):
          indices = range(interval_points[j], interval_points[j+1])
          X_batch = X[indices, :]
          y_batch = y[indices]
          w_batch = w[indices]
          self.fit_on_batch(X_batch, y_batch, w_batch)
      else:
        self.fit_on_batch(X, y, w)

  # TODO(rbharath): What does this function do when y is not provided. Suspect
  # it breaks. Need to fix.

  # TODO(rbharath): The structure of the produced df might be
  # complicated. Better way to model?
  def predict(self, numpy_dataset):
    """
    Uses self to make predictions on provided NumpyDataset object.

    Returns a DataFrame with one row per sample containing: ids, the true
    labels, per-task predictions, per-task weights, and the dataset-level
    label means/stds (so callers can undo output transforms).
    """
    task_names = numpy_dataset.get_task_names()
    pred_task_names = ["%s_pred" % task_name for task_name in task_names]
    w_task_names = ["%s_weight" % task_name for task_name in task_names]
    column_names = (['ids'] + task_names + pred_task_names + w_task_names
                           + ["y_means", "y_stds"])

    # TODO(rbharath/enf): This is only for GPU models, and currently depends
    # on magic numbers.
    MAX_GPU_RAM = float(691007488/50)
    shard_dfs = []
    for (X, y, w, ids) in numpy_dataset.itershards():
      if sys.getsizeof(X) > MAX_GPU_RAM:
        # np.linspace requires an integer sample count; round the block count
        # up so every sub-batch stays within the RAM budget. (Passing a float
        # num to linspace is a TypeError in modern numpy.)
        nb_block = int(np.ceil(float(sys.getsizeof(X))/MAX_GPU_RAM))
        nb_sample = np.shape(X)[0]
        interval_points = np.linspace(0, nb_sample, nb_block+1).astype(int)
        y_preds = []
        for j in range(len(interval_points)-1):
          indices = range(interval_points[j], interval_points[j+1])
          y_preds.append(self.predict_on_batch(X[indices, :]))
        y_pred = np.concatenate(y_preds)
      else:
        y_pred = self.predict_on_batch(X)
      # Predictions may come back as (n,) or (n, 1); align with the labels.
      y_pred = np.reshape(y_pred, np.shape(y))

      shard_df = pd.DataFrame(columns=column_names)
      shard_df['ids'] = ids
      shard_df[task_names] = y
      shard_df[pred_task_names] = y_pred
      shard_df[w_task_names] = w
      shard_df["y_means"] = numpy_dataset.get_label_means()
      shard_df["y_stds"] = numpy_dataset.get_label_stds()
      shard_dfs.append(shard_df)

    # Concatenate once at the end rather than per-shard to avoid quadratic
    # copying when the dataset has many shards.
    if not shard_dfs:
      return pd.DataFrame(columns=column_names)
    return pd.concat(shard_dfs)

'''
def model_predictions(X, model, n_targets, task_types, modeltype="sklearn"):
+28 −3
Original line number Diff line number Diff line
@@ -55,8 +55,11 @@ class MultiTaskDNN(Model):
  def get_data_dict(self, X, y=None):
    data = {}
    data["input"] = X
    print("get_data_dict()")
    print("self.task_types.keys()")
    print(self.task_types.keys())
    for ind, task in enumerate(sorted(self.task_types.keys())):
      task_type, taskname = task_types[task], "task%d" % ind
      task_type, taskname = self.task_types[task], "task%d" % ind
      if y is not None:
        if task_type == "classification":
          data[taskname] = to_one_hot(y[:, ind])
@@ -77,7 +80,7 @@ class MultiTaskDNN(Model):
    """
    eps = .001
    # Add eps weight to avoid minibatches with zero weight (causes theano to crash).
    W = W + eps * np.ones(np.shape(W))
    w = w + eps * np.ones(np.shape(w))
    data = self.get_data_dict(X, y)
    sample_weight = self.get_sample_weight(w)
    loss = self.raw_model.train_on_batch(data, sample_weight=sample_weight)
@@ -86,8 +89,30 @@ class MultiTaskDNN(Model):
    """
    Makes predictions on given batch of new data.
    """
    #print("deep.predict_on_batch()")
    #print("np.shape(X)")
    #print(np.shape(X))
    #print("type(self.raw_model)")
    #print(type(self.raw_model))
    data = self.get_data_dict(X)
    y_pred = self.raw_model.predict_on_batch(data)
    #print("data")
    #print(data)
    y_pred_dict = self.raw_model.predict_on_batch(data)
    sorted_tasks = sorted(self.task_types.keys())
    nb_samples = np.shape(X)[0]
    nb_tasks = len(sorted_tasks)
    y_pred = np.zeros((nb_samples, nb_tasks))
    for ind, task in enumerate(sorted_tasks):
      taskname = "task%d" % ind
      y_pred[:,ind] = np.squeeze(y_pred_dict[taskname])
    #print("np.shape(y_pred)")
    #print(np.shape(y_pred))
    #print("type(self.raw_model.predict(data))")
    #print(type(self.raw_model.predict(data)))
    #print("self.raw_model.predict(data).keys()")
    #print(self.raw_model.predict(data).keys())
    #print("np.shape(self.raw_model.predict(data))")
    #print(np.shape(self.raw_model.predict(data)))
    y_pred = np.squeeze(y_pred)
    return y_pred

+9 −4
Original line number Diff line number Diff line
@@ -100,7 +100,7 @@ def add_train_test_command(subparsers):
      "train-test-split",
      help="Apply standard data transforms to raw features generated by featurize,\n"
           "then split data into train/test and store data as (X,y) matrices.")
  add_transform_group(train_test_cmd)
  add_transforms_group(train_test_cmd)
  train_test_cmd.add_argument(
      "--paths", nargs="+", required=1,
      help="Paths to input datasets.")
@@ -216,7 +216,7 @@ def add_model_command(subparsers):
      help="The base directory for the model.")
  add_featurize_group(model_cmd)

  add_transform_group(model_cmd)
  add_transforms_group(model_cmd)
  add_model_group(model_cmd)
  model_cmd.set_defaults(func=create_model)

@@ -276,13 +276,18 @@ def create_model(args):
  stats_out_train = os.path.join(data_dir, "train-stats.txt")
  csv_out_test = os.path.join(data_dir, "test.csv")
  stats_out_test = os.path.join(data_dir, "test-stats.txt")
  print("create_model()")
  print("args.output_transforms")
  print(args.output_transforms)
  train_dir = os.path.join(data_dir, "train")
  eval_trained_model(
      model_name, model_dir, data_dir, csv_out_train,
      model_name, model_dir, train_dir, csv_out_train,
      stats_out_train, args.output_transforms, split="train")
  print("Eval Model on Test")
  print("------------------")
  test_dir = os.path.join(data_dir, "test")
  eval_trained_model(
      model_name, model_dir, data_dir, csv_out_test,
      model_name, model_dir, test_dir, csv_out_test,
      stats_out_test, args.output_transforms, split="test")

def parse_args(input_args=None):
+443 −0

File added.

Preview size limit exceeded, changes collapsed.

+43 −30
Original line number Diff line number Diff line
@@ -5,15 +5,16 @@ from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import warnings
#from deep_chem.utils.preprocess import undo_transform_outputs
from deep_chem.utils.preprocess import get_metadata_filename
from deep_chem.utils.preprocess import get_sorted_task_names
#from deep_chem.utils.preprocess import get_metadata_filename
from deep_chem.utils.dataset import NumpyDataset
from deep_chem.utils.preprocess import get_task_type
from deep_chem.utils.preprocess import undo_transform
from deep_chem.utils.save import load_model
from deep_chem.utils.save import load_sharded_dataset
from deep_chem.utils.dataset import load_sharded_dataset
from deep_chem.models import Model 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
@@ -30,7 +31,10 @@ __license__ = "LGPL"
def eval_trained_model(model_name, model_dir, data_dir,
                       csv_out, stats_out, output_transforms, split="test"):
  """Evaluates a trained model on specified data."""
  model = load_model(model_name, model_dir)
  print("eval_trained_model()")
  print("output_transforms")
  print(output_transforms)
  model = Model.load_model(model_name, model_dir)
  task_type = get_task_type(model_name)
  task_names, pred_y_df = compute_y_pred(model, data_dir, csv_out, split)
  compute_model_performance(pred_y_df, task_names, 
@@ -40,34 +44,36 @@ def compute_y_pred(model, data_dir, csv_out, split):
  """
  Computes model predictions on data and stores csv to disk.
  """
  test_dir = os.path.join(data_dir, "test")
  test = NumpyDataset(test_dir)
  test = NumpyDataset(data_dir)
  task_names = test.get_task_names()
  #metadata_filename = get_metadata_filename(data_dir)
  #metadata_df = load_sharded_dataset(metadata_filename)
  #task_names = metadata_df.iterrows().next()[1]['task_names']
  task_names = test.get_task_names()
  pred_task_names = ["%s_pred" % task_name for task_name in task_names]
  w_task_names = ["%s_weight" % task_name for task_name in task_names]
  column_names = (['ids'] + task_names + pred_task_names + w_task_names
                         + ["y_means", "y_stds"])
  pred_y_df = pd.DataFrame(columns=column_names)

  split_df = metadata_df.loc[metadata_df['split'] == split]
  nb_batch = split_df.shape[0]
  print("compute_y_pred()")
  print("split_df.shape")
  print(split_df.shape)
  # TODO(rbharath/enf): This is only for GPU models, and is currently depends
  # on magic numbers.
  MAX_GPU_RAM = float(691007488/50)

  pred_y_df = model.predict(test)
  #split_df = metadata_df.loc[metadata_df['split'] == split]
  #nb_batch = split_df.shape[0]
  #print("compute_y_pred()")
  #print("split_df.shape")
  #print(split_df.shape)

  print("Saving predictions to %s" % csv_out)
  pred_y_df.to_csv(csv_out)
  print("Saved.")

  return task_names, pred_y_df

  '''
  for i, row in split_df.iterrows():
    print("Evaluating on %s batch %d out of %d" % (split, i+1, nb_batch))
    X = load_sharded_dataset(row['X-transformed'])
    y = load_sharded_dataset(row['y-transformed'])
    w = load_sharded_dataset(row['w'])
    ids = load_sharded_dataset(row['ids'])

  '''
  '''
  MAX_GPU_RAM = float(691007488/50)
  for (X, y, w, ids) in test.itershards():
    if sys.getsizeof(X) > MAX_GPU_RAM:
      nb_block = float(sys.getsizeof(X))/MAX_GPU_RAM
      nb_sample = np.shape(X)[0]
@@ -82,7 +88,6 @@ def compute_y_pred(model, data_dir, csv_out, split):
      y_pred = np.concatenate(y_preds)
    else:
      y_pred = model.predict_on_batch(X)

    y_pred = np.reshape(y_pred, np.shape(y))

    mini_df = pd.DataFrame(columns=column_names)
@@ -93,12 +98,7 @@ def compute_y_pred(model, data_dir, csv_out, split):
    mini_df["y_means"] = split_df["y_means"]
    mini_df["y_stds"] = split_df["y_stds"]
    pred_y_df = pd.concat([pred_y_df, mini_df])

  print("Saving predictions to %s" % csv_out)
  pred_y_df.to_csv(csv_out)
  print("Saved.")

  return task_names, pred_y_df
  '''

def compute_model_performance(pred_y_df, task_names, task_type, stats_file, output_transforms):
  """
@@ -113,15 +113,24 @@ def compute_model_performance(pred_y_df, task_names, task_type, stats_file, outp

  performance_df = pd.DataFrame(columns=colnames)
  print("compute_model_performance()")
  print("output_transforms")
  print(output_transforms)
  print("pred_y_df")
  print(pred_y_df)
  y_means = pred_y_df.iterrows().next()[1]["y_means"]
  y_stds = pred_y_df.iterrows().next()[1]["y_stds"]

  for i, task_name in enumerate(task_names):
    print("task_name")
    print(task_name)
    y = pred_y_df[task_name]
    y_pred = pred_y_df["%s_pred" % task_name]
    w = pred_y_df["%s_weight" % task_name]
    print("pre-transform")
    print("y")
    print(y)
    print("y_pred")
    print(y_pred)

    y = undo_transform(y, y_means, y_stds, output_transforms)
    y_pred = undo_transform(y_pred, y_means, y_stds, output_transforms)
@@ -135,6 +144,10 @@ def compute_model_performance(pred_y_df, task_names, task_type, stats_file, outp
      performance_df.loc[i] = [task_name, auc, mcc, recall, accuracy]

    elif task_type == "regression":
      print("y")
      print(y)
      print("y_pred")
      print(y_pred)
      r2s = r2_score(y, y_pred)
      rms = np.sqrt(mean_squared_error(y, y_pred))
      performance_df.loc[i] = [task_name, r2s, rms]
Loading