Commit 00eeac13 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Undoing transforms is partially debugged.

parent 6d44182d
Loading
Loading
Loading
Loading
+15 −6
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@ from deep_chem.utils.featurize import generate_vs_utils_features
from deep_chem.models.standard import fit_singletask_models
from deep_chem.utils.load import get_target_names
from deep_chem.utils.load import process_datasets
from deep_chem.utils.load import transform_data
from deep_chem.utils.evaluate import results_to_csv
from deep_chem.utils.save import save_model
from deep_chem.utils.save import load_model
@@ -195,10 +196,15 @@ def train_test_input(args):
      args.input_transforms, output_transforms, feature_types=args.feature_types, 
      splittype=args.splittype, weight_positives=args.weight_positives,
      mode=args.mode)
  trans_train_dict = transform_data(train_dict, input_transforms, output_transforms)
  trans_test_dict = transform_data(test_dict, input_transforms, output_transforms)
  transforms = {"input_transforms": input_transforms,
                "output_transform": output_transforms}
  print "train_dict()"
  print train_dict
  trans_train_dict = transform_data(train_dict, args.input_transforms,
      args.output_transforms)
  trans_test_dict = transform_data(test_dict, args.input_transforms, args.output_transforms)
  print "train_dict()"
  print train_dict
  transforms = {"input_transforms": args.input_transforms,
                "output_transform": args.output_transforms}
  stored_train = {"raw": train_dict, "transformed": trans_train_dict, "transforms": transforms}
  stored_test = {"raw": test_dict, "transformed": trans_test_dict, "transforms": transforms}
  with gzip.open(args.train_out, "wb") as f:
@@ -248,9 +254,12 @@ def eval_trained_model(args):
  with gzip.open(args.saved_data) as f:
    stored_test = pickle.load(f)
  test_dict = stored_test["transformed"]
  raw_test_dict = stored_test["raw"]
  output_transforms = stored_test["transforms"]["output_transform"]

  results, aucs, r2s, rms = compute_model_performance(test_dict, task_types, model, args.modeltype,
    args.compute_aucs, args.compute_r2s, args.compute_rms) 
  results, aucs, r2s, rms = compute_model_performance(raw_test_dict, test_dict,
      task_types, model, args.modeltype, output_transforms, args.compute_aucs,
      args.compute_r2s, args.compute_rms) 
  if args.csv_out is not None:
    results_to_csv(results, args.csv_out, task_type=args.task_type)

+23 −8
Original line number Diff line number Diff line
@@ -10,23 +10,29 @@ import numpy as np
import warnings
from deep_chem.utils.preprocess import dataset_to_numpy
from deep_chem.utils.preprocess import labels_to_weights
from deep_chem.utils.preprocess import undo_transform_outputs
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from rdkit import Chem
from rdkit.Chem.Descriptors import ExactMolWt

def compute_model_performance(test_data, task_types, models, modeltype,
    aucs=True, r2s=False, rms=False):
def compute_model_performance(raw_test_data, test_data, task_types, models, modeltype,
    output_transforms, aucs=True, r2s=False, rms=False):
  """Computes statistics for model performance on test set."""
  all_results, auc_vals, r2_vals, rms_vals = {}, {}, {}, {}
  for index, target in enumerate(sorted(test_data.keys())):
    print "Evaluating model %d" % index
    print "Target %s" % target
    (test_ids, Xtest, ytest, wtest) = test_data[target]
    (_, _, ytest_raw, _) = raw_test_data[target]
    print "ytest"
    print ytest
    print "ytest_raw"
    print ytest_raw
    model = models[target]
    results = eval_model(test_ids, Xtest, ytest, wtest, model, {target: task_types[target]}, 
                         modeltype=modeltype)
    results = eval_model(test_ids, Xtest, ytest, ytest_raw, wtest, model, {target: task_types[target]}, 
                         modeltype=modeltype, output_transforms=output_transforms)
    #print results
    all_results[target] = results[target]
    if aucs:
@@ -94,7 +100,7 @@ def model_predictions(X, model, n_targets, task_types, modeltype="sklearn"):
  ypreds = np.reshape(ypreds, (len(ypreds), n_targets))
  return ypreds

def eval_model(ids, X, Ytrue, W, model, task_types, modeltype="sklearn"):
def eval_model(ids, X, Ytrue, Ytrue_raw, W, model, task_types, output_transforms, modeltype="sklearn"):
  """Evaluates the provided model on the test-set.

  Returns a dict which maps target-names to pairs of np.ndarrays (ytrue,
@@ -119,8 +125,19 @@ def eval_model(ids, X, Ytrue, W, model, task_types, modeltype="sklearn"):
  ypreds = model_predictions(X, model, len(task_types),
      task_types, modeltype=modeltype)
  results = {}
  print "eval_model()"
  print "Ytrue"
  print Ytrue
  print "Ytrue_raw"
  print Ytrue_raw
  for target_ind, target in enumerate(sorted_targets):
    ytrue, ypred = Ytrue[:, target_ind], ypreds[:, target_ind]
    ytrue_raw, ytrue, ypred = Ytrue_raw[:, target_ind], Ytrue[:, target_ind], ypreds[:, target_ind]
    #ypred = undo_transform_outputs(ytrue, ypred, output_transforms)
    ytrue_trans = undo_transform_outputs(ytrue_raw, ytrue, output_transforms)
    print "ytrue_raw"
    print ytrue_raw
    print "ytrue_trans"
    print ytrue_trans
    results[target] = (ids, np.squeeze(ytrue), np.squeeze(ypred))
  return results

@@ -147,8 +164,6 @@ def compute_roc_auc_scores(results, task_types):
  Parameters
  ----------
  results: dict
    A dictionary of type produced by eval_model which maps target-names to
    pairs of lists (ytrue, yscore).
  task_types: dict 
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
+15 −12
Original line number Diff line number Diff line
@@ -56,14 +56,14 @@ def process_datasets(paths, input_transforms, output_transforms,
  print np.shape(train_dict['CANVAS-BACE'][1])
  return train_dict, test_dict 

def transform_data(data_dict, input_transforms, output_transforms):
  """Build a dict of transformed entries, one per target in data_dict.

  Each value of data_dict is handed to transform_data together with the
  same input_transforms and output_transforms, and the result is stored
  under the same key.

  NOTE(review): another function with this same name appears to be defined
  later in this file; the inner call resolves to whichever definition is
  bound to the name at call time -- confirm which one is intended.
  """
  return {
      target: transform_data(
          data_dict[target], input_transforms, output_transforms)
      for target in data_dict
  }
#def transform_data(data_dict, input_transforms, output_transforms):
#  """Transforms data using specified transforms"""
#  trans_dict = {}
#  for target in data_dict:
#    data = data_dict[target]
#    trans_data = transform_data(data, input_transforms, output_transforms)
#    trans_dict[target] = trans_data
#  return trans_dict

def load_molecules(paths, feature_types=["fingerprints"]):
  """Load dataset fingerprints and return fingerprints.
@@ -215,7 +215,10 @@ def transform_data(data, input_transforms, output_transforms):
    are performed in the order specified. An empty list corresponds to no
    transformations. Only for regression outputs.
  """
  ids, X, y, W = data 
  trans_dict = {}
  for target in data:
    ids, X, y, W = data[target]
    y = transform_outputs(y, W, output_transforms)
    X = transform_inputs(X, input_transforms)
  return (ids, X, y, W)
    trans_dict[target] = (ids, X, y, W)
  return trans_dict
+33 −13
Original line number Diff line number Diff line
@@ -17,6 +17,8 @@ def to_arrays(train, test):

def transform_inputs(X, input_transforms):
  """Transform the input feature data."""
  # Copy X up front to have non-destructive updates.
  X = np.copy(X)
  if len(np.shape(X)) == 2:
    (n_samples, n_features) = np.shape(X)
  else:
@@ -42,6 +44,29 @@ def transform_inputs(X, input_transforms):
    Z[:, feature] = feature_data
  return Z


def undo_normalization(y_orig, y_pred):
  """Undo the applied normalization transform.

  Maps normalized values back onto the scale of the reference labels by
  multiplying by the reference standard deviation and adding the reference
  mean.

  Parameters
  ----------
  y_orig: ndarray
    Reference labels on the un-normalized scale. Only their mean and
    standard deviation, taken over the whole array, are used.
  y_pred: ndarray
    Normalized values to map back onto the scale of y_orig.

  Returns
  -------
  ndarray
    y_pred * std(y_orig) + mean(y_orig)
  """
  # NOTE(review): the forward "normalize" step appears to compute its
  # statistics only over labels with nonzero weight; here every entry of
  # y_orig contributes -- confirm the two agree for weighted data.
  old_mean = np.mean(y_orig)
  old_std = np.std(y_orig)
  # Rescale the normalized values, not the reference labels themselves.
  return y_pred * old_std + old_mean

def undo_transform_outputs(y_raw, y_pred, output_transforms):
  """Map transformed outputs back to the scale of the raw labels.

  Parameters
  ----------
  y_raw: ndarray
    Untransformed labels. Used only to recover the statistics needed to
    undo a "normalize" step.
  y_pred: ndarray
    Values on the transformed scale that should be mapped back.
  output_transforms: list
    Transforms that were applied, in order. Supported values are [] (or
    None), ["log"], ["normalize"] and ["log", "normalize"].

  Returns
  -------
  ndarray
    y_pred with the listed transforms undone; y_pred itself when no
    transforms were applied.

  Raises
  ------
  ValueError
    If output_transforms is any other combination.
  """
  # Debug tracing. Single-argument print(...) emits the same text whether
  # print is a statement or a function.
  print("undo_transform_outputs()")
  print("output_transforms")
  print(output_transforms)
  print("y_raw")
  print(y_raw)
  # An empty transform list means nothing was applied, so nothing to undo.
  if output_transforms is None or len(output_transforms) == 0:
    return y_pred
  # Compare as a list so an equivalent tuple of names is accepted too.
  transforms = list(output_transforms)
  if transforms == ["log"]:
    return np.exp(y_pred)
  elif transforms == ["normalize"]:
    return undo_normalization(y_raw, y_pred)
  elif transforms == ["log", "normalize"]:
    # Normalization was applied to log(y_raw), so undo it against those
    # statistics first and then invert the log.
    return np.exp(undo_normalization(np.log(y_raw), y_pred))
  else:
    raise ValueError("Unsupported output transforms.")

def transform_outputs(y, W, output_transforms):
  """Tranform the provided outputs

@@ -51,26 +76,21 @@ def transform_outputs(y, W, output_transforms):
    Labels
  W: ndarray
    Weights 
  output_transforms: dict 
    dict mapping target names to list of label transforms. Each list
    element must be "1+max-val", "log", "normalize". The transformations are
    performed in the order specified. An empty list
  output_transforms: list 
    List of specified transforms (must be "log", "normalize"). The
    transformations are performed in the order specified. An empty list
    corresponds to no transformations. Only for regression outputs.
  """
  sorted_targets = sorted(output_transforms.keys())
  endpoints = sorted_targets
  transforms = output_transforms.copy()
  for task, target in enumerate(endpoints):
    output_transforms = transforms[target]
  # Copy y up front so we have non-destructive updates
  y = np.copy(y)
  (_, n_targets) = np.shape(y)
  for task in range(n_targets):
    for output_transform in output_transforms:
      if output_transform == "log":
        y[:, task] = np.log(y[:, task])
      elif output_transform == "1+max-val":
        maxval = np.amax(y[:, task])
        y[:, task] = 1 + maxval - y[:, task]
      elif output_transform == "normalize":
        task_data = y[:, task]
        if task < len(sorted_targets):
        if task < n_targets:
          # Only elements of y with nonzero weight in W are true labels.
          nonzero = (W[:, task] != 0)
        else:
+1 −1
Original line number Diff line number Diff line
@@ -51,7 +51,7 @@ def save_keras_model(models, filename):
    json_string = model.to_json()
    with open(json_filename, "wb") as f:
      f.write(json_string)
    model.save_weights(h5_filename)
    model.save_weights(h5_filename, overwrite=True)

def load_keras_model(filename):
  """Loads keras model from disk.