Commit 801dd394 authored by Bharath Ramsundar

Continuing OO refactor.

parent c013e405
+25 −43
@@ -62,42 +62,48 @@ def add_featurize_group(featurize_cmd):
           "Will be created if does not exist")
  featurize_group.set_defaults(func=featurize_inputs_wrapper)

def add_train_test_command(subparsers):
  """Adds flags for train-test-split subcommand."""
  train_test_cmd = subparsers.add_parser(
      "train-test-split",
      help="Apply standard data transforms to raw features generated by featurize,\n"
           "then split data into train/test and store data as (X,y) matrices.")
  train_test_cmd.add_argument(
def add_transforms_group(cmd):
  """Adds flags for data transforms."""
  transform_group = cmd.add_argument_group("Transform Group")
  transform_group.add_argument(
      "--input-transforms", nargs="+", default=[],
      choices=["normalize-and-truncate"],
      choices=["normalize", "truncate", "log"],
      help="Transforms to apply to input data.")
  train_test_cmd.add_argument(
  transform_group.add_argument(
      "--output-transforms", nargs="+", default=[],
      choices=["normalize", "log"],
      help="Supported transforms are 'log' and 'normalize'. 'None' will be taken\n"
           "to mean no transforms are required.")
  train_test_cmd.add_argument(
  transform_group.add_argument(
      "--mode", default="singletask",
      choices=["singletask", "multitask"],
      help="Type of model being built.")
  transform_group.add_argument(
      "--feature-types", nargs="+", required=1,
      choices=["features", "fingerprints", "descriptors"],
      help="Featurizations of data to use.\n"
           "'features' denotes user-defined features.\n"
           "'fingerprints' denotes ECFP fingeprints.\n"
           "'descriptors' denotes RDKit chem descriptors.\n")
  train_test_cmd.add_argument(
      "--paths", nargs="+", required=1,
      help="Paths to input datasets.")
  train_test_cmd.add_argument(
  transform_group.add_argument(
      "--splittype", type=str, default="scaffold",
      choices=["scaffold", "random", "specified"],
      help="Type of train/test data-split. 'scaffold' uses Bemis-Murcko scaffolds.\n"
           "specified requires that split be in original data.")
  train_test_cmd.add_argument(
  transform_group.add_argument(
      "--weight-positives", type=bool, default=False,
      help="Weight positive examples to have same total weight as negatives.")

def add_train_test_command(subparsers):
  """Adds flags for train-test-split subcommand."""
  train_test_cmd = subparsers.add_parser(
      "train-test-split",
      help="Apply standard data transforms to raw features generated by featurize,\n"
           "then split data into train/test and store data as (X,y) matrices.")
  add_transforms_group(train_test_cmd)
  train_test_cmd.add_argument(
      "--mode", default="singletask",
      choices=["singletask", "multitask"],
      help="Type of model being built.")
      "--paths", nargs="+", required=1,
      help="Paths to input datasets.")
  train_test_cmd.add_argument(
      "--data-dir", type=str, required=1,
      help="Location to save train and test data.")
@@ -210,31 +216,7 @@ def add_model_command(subparsers):
      help="The base directory for the model.")
  add_featurize_group(model_cmd)

  train_test_group = model_cmd.add_argument_group("train_test_group")
  train_test_group.add_argument(
      "--input-transforms", nargs="+", default=[],
      choices=["normalize-and-truncate"],
      help="Transforms to apply to input data.")
  train_test_group.add_argument(
      "--output-transforms", nargs="+", default=[],
      help="Supported transforms are log and normalize.")
  train_test_group.add_argument(
      "--mode", default="singletask",
      choices=["singletask", "multitask"],
      help="Type of model being built.")
  train_test_group.add_argument(
      "--feature-types", nargs="+", required=1,
      choices=["features", "fingerprints", "descriptors"],
      help="Featurizations of data to use.\n"
           "'features' denotes user-defined features.\n"
           "'fingerprints' denotes ECFP fingeprints.\n"
           "'descriptors' denotes RDKit chem descriptors.\n")
  train_test_group.add_argument(
      "--splittype", type=str, default="scaffold",
      choices=["scaffold", "random", "specified"],
      help="Type of train/test data-split. 'scaffold' uses Bemis-Murcko scaffolds.\n"
           "specified requires that split be in original data.")

  add_transforms_group(model_cmd)
  add_model_group(model_cmd)
  model_cmd.set_defaults(func=create_model)
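The set_defaults(func=create_model) call above is the dispatch half of this CLI pattern: each subparser records its handler and the top-level driver simply calls args.func(args). A small sketch under that assumption; the handler body here is hypothetical.

import argparse

def create_model(args):
  # Hypothetical handler: the real create_model builds the model described
  # by the parsed arguments.
  print("building model in", args.base_dir)

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest="command")
model_cmd = subparsers.add_parser("model")
model_cmd.add_argument("--base-dir", default=".")
model_cmd.set_defaults(func=create_model)

args = parser.parse_args(["model", "--base-dir", "/tmp/model"])
args.func(args)  # dispatches to create_model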

+6 −3
@@ -40,9 +40,12 @@ def compute_y_pred(model, data_dir, csv_out, split):
  """
  Computes model predictions on data and stores csv to disk.
  """
  metadata_filename = get_metadata_filename(data_dir)
  metadata_df = load_sharded_dataset(metadata_filename)
  task_names = metadata_df.iterrows().next()[1]['task_names']
  test_dir = os.path.join(data_dir, "test")
  test = NumpyDataset(test_dir)
  #metadata_filename = get_metadata_filename(data_dir)
  #metadata_df = load_sharded_dataset(metadata_filename)
  #task_names = metadata_df.iterrows().next()[1]['task_names']
  task_names = test.get_task_names()
  pred_task_names = ["%s_pred" % task_name for task_name in task_names]
  w_task_names = ["%s_weight" % task_name for task_name in task_names]
  column_names = (['ids'] + task_names + pred_task_names + w_task_names
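This change reads the task names from the NumpyDataset itself rather than from the metadata file. A rough, self-contained sketch of how the prediction table could then be assembled from those names; the array shapes and the to_csv step are illustrative stand-ins, not the project's exact code.

import numpy as np
import pandas as pd

# Stand-ins for test.get_task_names() and the model's outputs.
task_names = ["task_a", "task_b"]
ids = np.array(["mol1", "mol2", "mol3"])
y_true = np.random.rand(3, 2)
y_pred = np.random.rand(3, 2)
w = np.ones((3, 2))

pred_task_names = ["%s_pred" % task_name for task_name in task_names]
w_task_names = ["%s_weight" % task_name for task_name in task_names]
column_names = ["ids"] + task_names + pred_task_names + w_task_names

csv_out_df = pd.DataFrame(
    np.column_stack([ids, y_true, y_pred, w]), columns=column_names)
csv_out_df.to_csv("predictions.csv", index=False)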
+0 −3
@@ -33,9 +33,6 @@ def fit_model(model_name, model_params, model_dir, data_dir):

  model = Model.model_builder(model_name, task_types, model_params)

  print("model")
  print(model)

  train_metadata = metadata_df.loc[metadata_df['split'] =="train"]
  nb_batch = train_metadata.shape[0]
  # TODO(rbharath/enf): This is black magic. Needs to be removed/made more
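For context, Model.model_builder is used here as a factory mapping a model name to a concrete model class. Its internals are not part of this diff, so the registry below is only an assumed sketch of that kind of dispatch, not the project's implementation.

_MODEL_REGISTRY = {}

def register_model(name):
  """Class decorator that records a model class under a string name."""
  def decorator(cls):
    _MODEL_REGISTRY[name] = cls
    return cls
  return decorator

class Model(object):
  def __init__(self, task_types, model_params):
    self.task_types = task_types
    self.model_params = model_params

  @staticmethod
  def model_builder(model_name, task_types, model_params):
    return _MODEL_REGISTRY[model_name](task_types, model_params)

@register_model("example_sklearn")
class ExampleSklearnModel(Model):
  pass

model = Model.model_builder("example_sklearn", {"task": "classification"}, {"C": 1.0})
print(type(model).__name__)  # ExampleSklearnModel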
+9 −103
@@ -10,9 +10,6 @@ from glob import glob
import pandas as pd
import os
import multiprocessing as mp
from deep_chem.utils.save import load_sharded_dataset
from deep_chem.utils.save import save_sharded_dataset
from functools import partial

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2015, Stanford University"
@@ -28,13 +25,6 @@ def get_task_type(model_name):
  else:
    return "regression"

def get_metadata_filename(data_dir):
  """
  Get standard location for metadata file.
  """
  metadata_filename = os.path.join(data_dir, "metadata.joblib")
  return metadata_filename

def train_test_split(paths, input_transforms, output_transforms,
                     feature_types, splittype, mode, data_dir):
  """Saves transformed model."""
@@ -43,6 +33,15 @@ def train_test_split(paths, input_transforms, output_transforms,
  dataset = FeaturizedDataset(paths=paths)
  train_dataset, test_dataset = dataset.train_test_split(splittype)

  train_dir = os.path.join(data_dir, "train")
  train_arrays = train_dataset.to_arrays(train_dir, mode, feature_types)
  print("Transforming train data.")
  train_arrays.transform_data(input_transforms, output_transforms)

  test_dir = os.path.join(data_dir, "test")
  test_arrays = test_dataset.to_arrays(test_dir, mode, feature_types)
  print("Transforming test data.")
  test_arrays.transform_data(input_transforms, output_transforms)

'''
  print("About to train/test split dataset")
@@ -64,47 +63,6 @@ def train_test_split(paths, input_transforms, output_transforms,
  print("Saved metadata.")
'''
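The refactored train_test_split above delegates splitting, array conversion, and transforms to the dataset objects. As a self-contained illustration of the split step alone, here is a plain-DataFrame stand-in covering only the 'specified' and 'random' cases; scaffold splitting is omitted.

import pandas as pd

def train_test_split_df(compound_df, splittype="random", frac_train=0.8, seed=0):
  """Return (train_df, test_df) for a toy compound table."""
  if splittype == "specified":
    # Assumes the raw data already carries a 'split' column.
    train_df = compound_df[compound_df["split"] == "train"]
    test_df = compound_df[compound_df["split"] == "test"]
  elif splittype == "random":
    shuffled = compound_df.sample(frac=1.0, random_state=seed)
    n_train = int(frac_train * len(shuffled))
    train_df, test_df = shuffled[:n_train], shuffled[n_train:]
  else:
    raise ValueError("Unsupported splittype: %s" % splittype)
  return train_df, test_df

compound_df = pd.DataFrame({"smiles": ["C", "CC", "CCC", "CCCC", "CCCCC"],
                            "split": ["train", "train", "train", "test", "test"]})
train_df, test_df = train_test_split_df(compound_df, splittype="specified")
print(len(train_df), len(test_df))  # 3 2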

def transform_data(metadata_df, input_transforms, output_transforms):
  train_df = metadata_df.loc[metadata_df["split"] == "train"]
  test_df = metadata_df.loc[metadata_df["split"] == "test"]
  (normalize_X, truncate_x, normalize_y, 
      truncate_y, log_X, log_y) = False, False, False, False, False, False

  if "normalize-and-truncate" in input_transforms:
    normalize_X=True 
    truncate_x=True
  elif "normalize" in input_transforms:
    normalize_X=True

  if "normalize" in output_transforms:
    normalize_y=True

  if "log" in input_transforms:
    log_X = True 
  if "log" in output_transforms:
    log_y = True

  print("Transforming training data.")
  X_means, X_stds, y_means, y_stds = transform(train_df, normalize_X, 
                                               normalize_y, truncate_x,
                                               truncate_y, log_X, log_y)
  nrow = train_df.shape[0]
  train_df['X_means'] = [X_means for i in range(0,nrow)]
  train_df['X_stds'] = [X_stds for i in range(0,nrow)]
  train_df['y_means'] = [y_means for i in range(0,nrow)]
  train_df['y_stds'] = [y_stds for i in range(0,nrow)]

  print("Transforming test data.")
  X_means, X_stds, y_means, y_stds = transform(test_df, normalize_X, 
                                               normalize_y, truncate_x,
                                               truncate_y, log_X, log_y)
  nrow = test_df.shape[0]
  test_df['X_means'] = [X_means for i in range(0,nrow)]
  test_df['X_stds'] = [X_stds for i in range(0,nrow)]
  test_df['y_means'] = [y_means for i in range(0,nrow)]
  test_df['y_stds'] = [y_stds for i in range(0,nrow)]

  return(pd.concat([train_df, test_df]))

def undo_normalization(y, y_means, y_stds):
  """Undo the applied normalization transform."""
@@ -127,58 +85,6 @@ def undo_transform(y, y_means, y_stds, output_transforms):
  else:
    raise ValueError("Unsupported output transforms.")

def transform_row(i, df, normalize_X, normalize_y, truncate_X, truncate_y,
                      log_X, log_y, X_means, X_stds, y_means, y_stds, trunc):
  total = df.shape[0]
  row = df.iloc[i]
  X = load_sharded_dataset(row['X'])
  if normalize_X or log_X:
    if normalize_X:
      print("Normalizing X sample %d out of %d" % (i+1,total))
      X = np.nan_to_num((X - X_means) / X_stds)
      if truncate_X:
         print("Truncating X sample %d out of %d" % (i+1,total))
         X[X > trunc] = trunc
         X[X < (-1.0*trunc)] = -1.0 * trunc
    if log_X:
      X = np.log(X)
  save_sharded_dataset(X, row['X-transformed'])

  y = load_sharded_dataset(row['y'])
  if normalize_y or log_y:    
    if normalize_y:
      print("Normalizing y sample %d out of %d" % (i+1,total))
      y = np.nan_to_num((y - y_means) / y_stds)
      if truncate_y:
        y[y > trunc] = trunc
        y[y < (-1.0*trunc)] = -1.0 * trunc
    if log_y:
      y = np.log(y)
  save_sharded_dataset(y, row['y-transformed'])  

def transform(df, normalize_X=True, normalize_y=True, 
              truncate_X=True, truncate_y=True,
              log_X=False, log_y=False, parallel=False):
  trunc = 5.0
  X_means, X_stds, y_means, y_stds = compute_mean_and_std(df)
  total = df.shape[0]
  indices = range(0, df.shape[0])
  transform_row_partial = partial(transform_row, df=df, normalize_X=normalize_X, 
                                  normalize_y=normalize_y, truncate_X=truncate_X, 
                                  truncate_y=truncate_y, log_X=log_X,
                                 log_y=log_y, X_means=X_means, X_stds=X_stds,
                                 y_means=y_means, y_stds=y_stds, trunc=trunc)
  if parallel:
    pool = mp.Pool(int(mp.cpu_count()/4))
    pool.map(transform_row_partial, indices)
    pool.terminate()
  else:
    for index in indices:
      transform_row_partial(index)

  return X_means, X_stds, y_means, y_stds


# TODO(enf/rbharath): This is completely broken.

'''
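Since undo_normalization and undo_transform survive this cleanup, the round trip they rely on is worth spelling out. A small sketch of standardizing y with optional truncation and then inverting it, mirroring the removed transform logic with trunc = 5.0 as above.

import numpy as np

def normalize(y, truncate=False, trunc=5.0):
  """Standardize y column-wise; optionally clip standardized values at +/- trunc."""
  y_means, y_stds = y.mean(axis=0), y.std(axis=0)
  y_norm = np.nan_to_num((y - y_means) / y_stds)
  if truncate:
    y_norm = np.clip(y_norm, -trunc, trunc)
  return y_norm, y_means, y_stds

def undo_normalization(y, y_means, y_stds):
  """Undo the applied normalization transform."""
  return y * y_stds + y_means

y = np.array([[1.0], [2.0], [3.0]])
y_norm, y_means, y_stds = normalize(y)
assert np.allclose(undo_normalization(y_norm, y_means, y_stds), y)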
+42 −0
"""
Tests for dataset classes. 
"""
import os
import unittest
import numpy as np
import pandas as pd
import tempfile
import shutil
from deep_chem.utils.dataset import FeaturizedDataset
from deep_chem.utils.save import save_sharded_dataset

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2015, Stanford University"
@@ -22,6 +26,14 @@ def featurize_compound(smiles, split=None):
          "fingerprints": np.zeros(10),
          "task": 1.0}

def featurized_dataset_from_data(data_df, out_dir):
  """
  Writes featurized data to disk and returns a FeaturizedDataset object.
  """
  data_loc = os.path.join(out_dir, "data.joblib")
  save_sharded_dataset(data_df, data_loc)
  return FeaturizedDataset(paths=[out_dir])

class TestFeaturizedDataset(unittest.TestCase):
  """
  Test FeaturizedDataset.
@@ -60,3 +72,33 @@ class TestFeaturizedDataset(unittest.TestCase):
    train, test = dataset.train_test_split(splittype="specified")
    assert len(train.compound_df) == .8 * len(self.compound_df)
    assert len(test.compound_df) == .2 * len(self.compound_df)

  def test_to_arrays(self):
    """
    Basic sanity test of to_arrays function.
    """
    dataset = FeaturizedDataset(compound_df=self.compound_df)
    # Test singletask mode writing runs
    dirpath = tempfile.mkdtemp()
    arrays = dataset.to_arrays(dirpath, "singletask", ["fingerprints"])
    shutil.rmtree(dirpath)

    # Test multitask mode writing runs
    dirpath = tempfile.mkdtemp()
    arrays = dataset.to_arrays(dirpath, "multitask", ["fingerprints"])
    shutil.rmtree(dirpath)

  def test_transform_data(self):
    """
    Basic sanity test of data transforms.
    """
    featurepath = tempfile.mkdtemp()
    dataset = featurized_dataset_from_data(self.compound_df, featurepath)
    # Test normalization transforms. 
    dirpath = tempfile.mkdtemp()
    arrays = dataset.to_arrays(dirpath, "singletask", ["fingerprints"])
    input_transforms = ["normalize"]
    output_transforms = ["normalize"]
    arrays.transform_data(input_transforms, output_transforms)
    shutil.rmtree(dirpath)
    shutil.rmtree(featurepath)
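One small variant worth considering for these tests: registering temporary-directory cleanup with addCleanup (or tearDown) removes the directories even when an assertion fails before the shutil.rmtree calls run. A sketch of that pattern in isolation.

import os
import shutil
import tempfile
import unittest

class TempDirTestCase(unittest.TestCase):
  """Helper that cleans up temp dirs whether or not the test passes."""

  def make_tempdir(self):
    dirpath = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, dirpath)  # runs on pass and on failure
    return dirpath

  def test_tempdir_is_created(self):
    dirpath = self.make_tempdir()
    self.assertTrue(os.path.isdir(dirpath))

if __name__ == "__main__":
  unittest.main()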