Commit f0f841be authored by Bharath Ramsundar

Eval bug now looks fixed. Need to fix fit() to handle minibatches.

parent 92d262ee
+9 −7
@@ -118,7 +118,7 @@ class Model(object):
     save_to_disk(params, Model.get_params_filename(out_dir))
 
   # TODO(rbharath): This training is currently broken w.r.t minibatches! Fix.
-  def fit(self, sharded_dataset):
+  def fit(self, dataset):
     """
     Fits a model on data in a Dataset object.
     """
@@ -127,7 +127,7 @@ class Model(object):
    MAX_GPU_RAM = float(691007488/50)
    MAX_GPU_RAM = float(691007488/50)
    for epoch in range(self.model_params["nb_epoch"]):
    for epoch in range(self.model_params["nb_epoch"]):
      print("Starting epoch %s" % str(epoch+1))
      print("Starting epoch %s" % str(epoch+1))
      for i, (X, y, w, _) in enumerate(sharded_dataset.itershards()):
      for i, (X, y, w, _) in enumerate(dataset.itershards()):
        print("Training on batch-%s/epoch-%s" % (str(i+1), str(epoch+1)))
        print("Training on batch-%s/epoch-%s" % (str(i+1), str(epoch+1)))
        if sys.getsizeof(X) > MAX_GPU_RAM:
        if sys.getsizeof(X) > MAX_GPU_RAM:
          nb_block = float(sys.getsizeof(X))/MAX_GPU_RAM
          nb_block = float(sys.getsizeof(X))/MAX_GPU_RAM
@@ -147,11 +147,11 @@ class Model(object):
 
   # TODO(rbharath): The structure of the produced df might be
   # complicated. Better way to model?
-  def predict(self, sharded_dataset):
+  def predict(self, dataset):
     """
     Uses self to make predictions on provided Dataset object.
     """
-    task_names = sharded_dataset.get_task_names()
+    task_names = dataset.get_task_names()
     pred_task_names = ["%s_pred" % task_name for task_name in task_names]
     w_task_names = ["%s_weight" % task_name for task_name in task_names]
     column_names = (['ids'] + task_names + pred_task_names + w_task_names
@@ -161,7 +161,7 @@ class Model(object):
     # TODO(rbharath/enf): This is only for GPU models, and currently depends
     # on magic numbers.
     MAX_GPU_RAM = float(691007488/50)
-    for (X, y, w, ids) in sharded_dataset.itershards():
+    for (X, y, w, ids) in dataset.itershards():
       if sys.getsizeof(X) > MAX_GPU_RAM:
         nb_block = float(sys.getsizeof(X))/MAX_GPU_RAM
         nb_sample = np.shape(X)[0]
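Both fit() and predict() guard shard size with this check, but the hunks cut off before the actual blocking. A minimal sketch of one way the blocking could work, assuming contiguous sample-wise slicing (the linspace arithmetic below is an illustration, not the commit's code):

import sys
import numpy as np

# The commit's magic number: a fixed fraction of one GPU's memory, in bytes.
MAX_GPU_RAM = float(691007488/50)

X = np.zeros((200000, 1024))  # a toy shard that exceeds the cap

if sys.getsizeof(X) > MAX_GPU_RAM:
  nb_block = float(sys.getsizeof(X))/MAX_GPU_RAM
  nb_sample = np.shape(X)[0]
  # Illustrative continuation: split the samples into ceil(nb_block)
  # contiguous ranges so each block fits under MAX_GPU_RAM.
  bounds = np.linspace(0, nb_sample, int(np.ceil(nb_block)) + 1, dtype=int)
  blocks = [X[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]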
@@ -180,8 +180,10 @@ class Model(object):
       shard_df[task_names] = y
       shard_df[pred_task_names] = y_pred
       shard_df[w_task_names] = w
-      shard_df["y_means"] = sharded_dataset.get_label_means()
-      shard_df["y_stds"] = sharded_dataset.get_label_stds()
+      # TODO(rbharath): This feels like a total hack. Is there a structured way
+      # to deal with this instead?
+      shard_df["y_means"] = list(dataset.get_label_means())[0] * np.ones(np.shape(y))
+      shard_df["y_stds"] = list(dataset.get_label_stds())[0] * np.ones(np.shape(y))
       pred_y_df = pd.concat([pred_y_df, shard_df])
 
     return pred_y_df
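The replacement lines turn a per-task scalar into a full column. A self-contained sketch of that broadcasting pattern, with hypothetical stand-ins for dataset.get_label_means() and dataset.get_label_stds():

import numpy as np

# Hypothetical single-task label statistics standing in for
# dataset.get_label_means() and dataset.get_label_stds().
label_means, label_stds = [0.42], [1.7]

y = np.zeros((8, 1))  # eight samples, one task

# The diff's pattern: take the first task's statistic and broadcast it
# to y's shape so it can fill a DataFrame column row by row.
y_means = list(label_means)[0] * np.ones(np.shape(y))
y_stds = list(label_stds)[0] * np.ones(np.shape(y))
assert y_means.shape == y.shape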
+13 −17
@@ -18,10 +18,6 @@ import deepchem.models.deep
 import deepchem.models.standard
 import deepchem.models.deep3d
 
-# TODO(rbharath): Are any commands except for create_model actually used? Due to
-# the --skip-foo flags, it's possible to run all functionality directly through
-# create_model. Perhaps trim the fat and delete the remaining commands.
-
 def add_featurize_group(featurize_cmd):
   """Adds flags for featurization."""
   featurize_group = featurize_cmd.add_argument_group("Input Specifications")
@@ -138,17 +134,17 @@ def add_model_command(subparsers):
      "model", help="Combines featurize, train-test-split, fit, eval into one\n"
      "model", help="Combines featurize, train-test-split, fit, eval into one\n"
      "command for user convenience.")
      "command for user convenience.")
  model_cmd.add_argument(
  model_cmd.add_argument(
      "--skip-featurization", action="store_true",
      "--featurize", action="store_true",
      help="If set, skip the featurization step.")
      help="Perform the featurization step.")
  model_cmd.add_argument(
  model_cmd.add_argument(
      "--skip-train-test-split", action="store_true",
      "--train-test-split", action="store_true",
      help="If set, skip the train-test-split step.")
      help="Perform the train-test-split step.")
  model_cmd.add_argument(
  model_cmd.add_argument(
      "--skip-fit", action="store_true",
      "--fit", action="store_true",
      help="If set, skip model fit step.")
      help="Perform model fit step.")
  model_cmd.add_argument(
  model_cmd.add_argument(
      "--skip-eval", action="store_true",
      "--eval", action="store_true",
      help="If set, skip model eval step.")
      help="Perform model eval step.")
  model_cmd.add_argument(
  model_cmd.add_argument(
      "--base-dir", type=str, required=1,
      "--base-dir", type=str, required=1,
      help="The base directory for the model.")
      help="The base directory for the model.")
@@ -186,7 +182,7 @@ def create_model(args):
 
   print("+++++++++++++++++++++++++++++++++")
   print("Perform featurization")
-  if not args.skip_featurization:
+  if args.featurize:
     featurize_inputs(
         feature_dir, args.input_files,
         args.user_specified_features, args.tasks,
@@ -196,14 +192,14 @@ def create_model(args):
   print("+++++++++++++++++++++++++++++++++")
   print("Perform train-test split")
   paths = [feature_dir]
-  if not args.skip_train_test_split:
+  if args.train_test_split:
     train_test_split(
         paths, args.input_transforms, args.output_transforms, args.feature_types,
         args.splittype, args.mode, data_dir)
 
   print("+++++++++++++++++++++++++++++++++")
   print("Fit model")
-  if not args.skip_fit:
+  if args.fit:
     model_params = extract_model_params(args)
     fit_model(
         model_name, model_params, model_dir, data_dir)
@@ -211,7 +207,7 @@ def create_model(args):
  print("+++++++++++++++++++++++++++++++++")
  print("+++++++++++++++++++++++++++++++++")
  print("Eval Model on Train")
  print("Eval Model on Train")
  print("-------------------")
  print("-------------------")
  if not args.skip_eval:
  if args.eval:
    csv_out_train = os.path.join(data_dir, "train.csv")
    csv_out_train = os.path.join(data_dir, "train.csv")
    stats_out_train = os.path.join(data_dir, "train-stats.txt")
    stats_out_train = os.path.join(data_dir, "train-stats.txt")
    csv_out_test = os.path.join(data_dir, "test.csv")
    csv_out_test = os.path.join(data_dir, "test.csv")
@@ -222,7 +218,7 @@ def create_model(args):
       stats_out_train, split="train")
   print("Eval Model on Test")
   print("------------------")
-  if not args.skip_eval:
+  if args.eval:
     test_dir = os.path.join(data_dir, "test-data")
     eval_trained_model(
         model_name, model_dir, test_dir, csv_out_test,
+0 −20
@@ -6,18 +6,10 @@ import numpy as np
 import pandas as pd
 from functools import partial
 from rdkit import Chem
-from vs_utils.utils import ScaffoldGenerator
 from deepchem.utils.save import save_to_disk
 from deepchem.utils.save import load_from_disk
 from deepchem.utils.featurize import FeaturizedSamples
 
-def generate_scaffold(smiles, include_chirality=False, smiles_field="smiles"):
-  """Compute the Bemis-Murcko scaffold for a SMILES string."""
-  mol = Chem.MolFromSmiles(smiles)
-  engine = ScaffoldGenerator(include_chirality=include_chirality)
-  scaffold = engine.get_scaffold(mol)
-  return scaffold
-
 class Dataset(object):
   """
   Wrapper class for dataset transformed into X, y, w numpy ndarrays.
@@ -37,19 +29,10 @@ class Dataset(object):
       write_dataset_single_partial = partial(
           write_dataset_single, data_dir=self.data_dir,
          feature_types=feature_types)
-      print("Dataset()")
-      print("data_dir")
-      print(data_dir)
-      print("len(samples.compounds_df)")
-      print(len(samples.compounds_df))
 
       metadata_rows = []
       # TODO(rbharath): Still a bit of information leakage.
       for df_file, df in zip(samples.dataset_files, samples.itersamples()):
-        print("df_file")
-        print(df_file)
-        print("len(df)")
-        print(len(df))
         retval = write_dataset_single_partial((df_file, df))
         if retval is not None:
           metadata_rows.append(retval)
@@ -301,9 +284,6 @@ def df_to_numpy(df, feature_types):
       feature_list.append(datapoint[feature_type])
     features = np.squeeze(np.concatenate(feature_list))
     tensors.append(features)
-  print("df_to_numpy()")
-  print("len(df)")
-  print(len(df))
   x = np.stack(tensors)
 
   # Remove entries with missing labels
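For reference, the loop this hunk cleans up assembles one feature matrix per shard. A toy reconstruction of that pattern, using a hypothetical "fingerprint" feature type:

import numpy as np

# Hypothetical datapoints, each carrying a (1, 4) "fingerprint" array.
datapoints = [{"fingerprint": np.ones((1, 4))} for _ in range(3)]
feature_types = ["fingerprint"]

tensors = []
for datapoint in datapoints:
  # Concatenate this datapoint's feature arrays, then drop size-1 dims.
  feature_list = [datapoint[ft] for ft in feature_types]
  features = np.squeeze(np.concatenate(feature_list))
  tensors.append(features)
x = np.stack(tensors)  # shape (3, 4): one row per datapoint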
+11 −7
@@ -16,6 +16,15 @@ from vs_utils.features.fingerprints import CircularFingerprint
 from vs_utils.features.basic import SimpleDescriptors
 from deepchem.utils.save import save_to_disk
 from deepchem.utils.save import load_from_disk
+from vs_utils.utils import ScaffoldGenerator
+
+def generate_scaffold(smiles, include_chirality=False, smiles_field="smiles"):
+  """Compute the Bemis-Murcko scaffold for a SMILES string."""
+  mol = Chem.MolFromSmiles(smiles)
+  engine = ScaffoldGenerator(include_chirality=include_chirality)
+  scaffold = engine.get_scaffold(mol)
+  return scaffold
+
 
 def _process_field(val):
   """Parse data in a field."""
@@ -328,20 +337,15 @@ class FeaturizedSamples(object):
    if splittype == "random":
    if splittype == "random":
      train_inds, test_inds = self._train_test_random_split(seed=seed, frac_train=frac_train)
      train_inds, test_inds = self._train_test_random_split(seed=seed, frac_train=frac_train)
    elif splittype == "scaffold":
    elif splittype == "scaffold":
      train_inds, test_inds = self.train_test_scaffold_split(frac_train=frac_train)
      train_inds, test_inds = self._train_test_scaffold_split(frac_train=frac_train)
    elif splittype == "specified":
    elif splittype == "specified":
      train_inds, test_inds = self.train_test_specified_split()
      train_inds, test_inds = self._train_test_specified_split()
    else:
    else:
      raise ValueError("improper splittype.")
      raise ValueError("improper splittype.")
    print("train_test_split()")
    train_samples = FeaturizedSamples(train_dir, self.dataset_files)
    train_samples = FeaturizedSamples(train_dir, self.dataset_files)
    train_samples._set_compound_df(self.compounds_df.iloc[train_inds])
    train_samples._set_compound_df(self.compounds_df.iloc[train_inds])
    print("len(train_inds)")
    print(len(train_inds))
    test_samples = FeaturizedSamples(test_dir, self.dataset_files)
    test_samples = FeaturizedSamples(test_dir, self.dataset_files)
    test_samples._set_compound_df(self.compounds_df.iloc[test_inds])
    test_samples._set_compound_df(self.compounds_df.iloc[test_inds])
    print("len(test_inds)")
    print(len(test_inds))


    return train_samples, test_samples
    return train_samples, test_samples