Commit c3d0520f authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Fixed bug in normalization. Still bug in train/test eval.

parent a4ab24f3
Loading
Loading
Loading
Loading
+17 −12
Original line number Diff line number Diff line
@@ -197,6 +197,9 @@ def add_model_command(subparsers):
  model_cmd.add_argument(
      "--skip-fit", action="store_true",
      help="If set, skip model fit step.")
  model_cmd.add_argument(
      "--skip-eval", action="store_true",
      help="If set, skip model eval step.")
  model_cmd.add_argument(
      "--base-dir", type=str, required=1,
      help="The base directory for the model.")
@@ -258,6 +261,7 @@ def create_model(args):
  print("+++++++++++++++++++++++++++++++++")
  print("Eval Model on Train")
  print("-------------------")
  if not args.skip_fit:
    csv_out_train = os.path.join(data_dir, "train.csv")
    stats_out_train = os.path.join(data_dir, "train-stats.txt")
    csv_out_test = os.path.join(data_dir, "test.csv")
@@ -268,6 +272,7 @@ def create_model(args):
        stats_out_train, args.output_transforms, split="train")
  print("Eval Model on Test")
  print("------------------")
  if not args.skip_fit:
    test_dir = os.path.join(data_dir, "test")
    eval_trained_model(
        model_name, model_dir, test_dir, csv_out_test,
+4 −4
Original line number Diff line number Diff line
@@ -397,9 +397,9 @@ def compute_sums_and_nb_sample(tensor, W=None):
    nb_sample = np.shape(tensor)[0]
  else:
    nb_task = np.shape(tensor)[1]
    sums = np.zeros((nb_task))
    sum_squares = np.zeros((nb_task))
    nb_sample = np.zeros((nb_task))
    sums = np.zeros(nb_task)
    sum_squares = np.zeros(nb_task)
    nb_sample = np.zeros(nb_task)
    for task in range(nb_task):
      y_task = tensor[:,task]
      W_task = W[:,task]
@@ -417,7 +417,7 @@ def compute_mean_and_std(df):
  X_sums, X_sum_squares, X_n = (df['X_sums'], 
                                df['X_sum_squares'],
                                df['X_n'])
  n = np.sum(X_n)
  n = float(np.sum(X_n))
  overall_X_sums = np.sum(X_sums, axis=0)
  overall_X_means = overall_X_sums / n
  overall_X_sum_squares = np.sum(X_sum_squares, axis=0)
+6 −1
Original line number Diff line number Diff line
@@ -56,11 +56,16 @@ def compute_model_performance(pred_y_df, task_names, task_type, stats_file, outp
  y_means = pred_y_df.iterrows().next()[1]["y_means"]
  y_stds = pred_y_df.iterrows().next()[1]["y_stds"]

  print("compute_model_performance()")
  for i, task_name in enumerate(task_names):
    y = pred_y_df[task_name]
    y_pred = pred_y_df["%s_pred" % task_name]
    w = pred_y_df["%s_weight" % task_name]
    
    print("y_means")
    print(y_means)
    print("y_stds")
    print(y_stds)
    y = undo_transform(y, y_means, y_stds, output_transforms)
    y_pred = undo_transform(y_pred, y_means, y_stds, output_transforms)

+1 −129
Original line number Diff line number Diff line
@@ -43,31 +43,9 @@ def train_test_split(paths, input_transforms, output_transforms,
  print("Transforming test data.")
  test_arrays.transform_data(input_transforms, output_transforms)

'''
  print("About to train/test split dataset")
  train_files, test_files = get_train_test_files(paths, splittype)
  train_metadata = write_dataset(train_files, data_dir, mode, feature_types)
  train_metadata["split"] = "train"
  test_metadata = write_dataset(test_files, data_dir, mode, feature_types)
  test_metadata["split"] = "test"

  metadata = pd.concat([train_metadata, test_metadata])
  metadata["input_transforms"] = ",".join(input_transforms)
  metadata["output_transforms"] = ",".join(output_transforms)

  metadata = transform_data(metadata, input_transforms, output_transforms)

  metadata_filename = get_metadata_filename(data_dir)
  print("Saving metadata file to %s" % metadata_filename)
  save_to_disk(metadata, metadata_filename)
  print("Saved metadata.")
'''


def undo_normalization(y, y_means, y_stds):
  """Undo the applied normalization transform.

  Inverts a z-score style normalization: assumes the forward transform was
  (y - y_means) / y_stds (TODO confirm against the forward transform, which
  is not visible here), so the inverse rescales by the std and re-adds the
  mean.

  Parameters
  ----------
  y: array-like
    Normalized values (scalar or np.ndarray; broadcasting applies).
  y_means: array-like
    Mean(s) used in the forward normalization.
  y_stds: array-like
    Standard deviation(s) used in the forward normalization.

  Returns
  -------
  De-normalized values, y * y_stds + y_means.
  """
  # Bug fix: the previous body multiplied by the mean and added the std
  # (and mutated y via an unreachable-after-fix intermediate assignment).
  # The correct inverse of (y - mean) / std is y * std + mean.
  return y * y_stds + y_means

def undo_transform(y, y_means, y_stds, output_transforms):
  """Undo transforms on y_pred, W_pred."""
@@ -118,109 +96,3 @@ def multitask_to_singletask(dataset):
        singletask_labels[target].append(labels[target])
  return singletask_features, singletask_labels
'''

#TODO(enf/rbharath): completely broken as well.
'''
def split_dataset(dataset, splittype, seed=none):
  """split provided data using specified method."""
  if splittype == "random":
    train, test = train_test_random_split(dataset, seed=seed)
  elif splittype == "scaffold":
    train, test = train_test_scaffold_split(dataset)
  elif splittype == "specified":
    train, test = train_test_specified_split(dataset)
  else:
    raise valueerror("improper splittype.")
  return train, test

def train_test_specified_split(dataset):
  """split provided data due to splits in origin data."""
  train, test = {}, {}
  for mol_id, datapoint in dataset.iteritems():
    if "split" not in datapoint:
      raise valueerror("missing required split information.")
    if datapoint["split"].lower() == "train":
      train[mol_id] = datapoint
    elif datapoint["split"].lower() == "test":
      test[mol_id] = datapoint
  return train, test

def train_test_random_split(dataset, frac_train=.8, seed=none):
  """splits provided data into train/test splits randomly.

  performs a random 80/20 split of the data into train/test. returns two
  dictionaries

  parameters
  ----------
  dataset: dict
    a dictionary of type produced by load_datasets.
  frac_train: float
    proportion of data in train set.
  seed: int (optional)
    seed to initialize np.random.
  """
  np.random.seed(seed)
  shuffled = np.random.permutation(dataset.keys())
  train_cutoff = np.floor(frac_train * len(shuffled))
  train_keys, test_keys = shuffled[:train_cutoff], shuffled[train_cutoff:]
  train, test = {}, {}
  for key in train_keys:
    train[key] = dataset[key]
  for key in test_keys:
    test[key] = dataset[key]
  return train, test

def train_test_scaffold_split(dataset, frac_train=.8):
  """splits provided data into train/test splits by scaffold.

  groups the largest scaffolds into the train set until the size of the
  train set equals frac_train * len(dataset). adds remaining scaffolds
  to test set. the idea is that the test set contains outlier scaffolds,
  and thus serves as a hard test of generalization capability for the
  model.

  parameters
  ----------
  dataset: dict
    a dictionary of type produced by load_datasets.
  frac_train: float
    the fraction (between 0 and 1) of the data to use for train set.
  """
  scaffolds = scaffold_separate(dataset)
  train_size = frac_train * len(dataset)
  train, test = {}, {}
  for elements in scaffolds:
    # if adding this scaffold makes the train_set too big, add to test set.
    if len(train) + len(elements) > train_size:
      for elt in elements:
        test[elt] = dataset[elt]
    else:
      for elt in elements:
        train[elt] = dataset[elt]
  return train, test

def scaffold_separate(dataset):
  """splits provided data by compound scaffolds.

  returns a list of pairs (scaffold, [identifiers]), where each pair
  contains a scaffold and a list of all identifiers for compounds that
  share that scaffold. the list will be sorted in decreasing order of
  number of compounds.

  parameters
  ----------
  dataset: dict
    a dictionary of type produced by load_datasets.
  """
  scaffolds = {}
  for mol_id in dataset:
    datapoint = dataset[mol_id]
    scaffold = datapoint["scaffold"]
    if scaffold not in scaffolds:
      scaffolds[scaffold] = [mol_id]
    else:
      scaffolds[scaffold].append(mol_id)
  # sort from largest to smallest scaffold sets
  return [elt for (scaffold, elt) in sorted(scaffolds.items(), key=lambda x: -len(x[1]))]
'''