Commit 5273d873 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Passing singletask/multitask test suite.

parent 82f05b69
Loading
Loading
Loading
Loading
+12 −17
Original line number Diff line number Diff line
@@ -117,7 +117,6 @@ class Model(object):
              "model_type": self.model_type}
    save_to_disk(params, Model.get_params_filename(out_dir))

  # TODO(rbharath): This training is currently broken w.r.t minibatches! Fix.
  def fit(self, dataset):
    """
    Fits a model on data in a Dataset object.
@@ -130,7 +129,8 @@ class Model(object):
      for i, (X, y, w, _) in enumerate(dataset.itershards()):
        print("Training on batch-%s/epoch-%s" % (str(i+1), str(epoch+1)))
        nb_sample = np.shape(X)[0]
        interval_points = np.linspace(0,nb_sample, np.ceil(float(nb_sample)/batch_size)+1).astype(int)
        interval_points = np.linspace(
            0, nb_sample, np.ceil(float(nb_sample)/batch_size)+1).astype(int)
        for j in range(len(interval_points)-1):
          indices = range(interval_points[j],interval_points[j+1])
          X_batch = X[indices, :]
@@ -154,21 +154,16 @@ class Model(object):
                           + ["y_means", "y_stds"])
    pred_y_df = pd.DataFrame(columns=column_names)

    # TODO(rbharath/enf): This is only for GPU models, and is currently depends
    # on magic numbers.
    MAX_GPU_RAM = float(691007488/50)
    batch_size = self.model_params["batch_size"]
    for (X, y, w, ids) in dataset.itershards():
      if sys.getsizeof(X) > MAX_GPU_RAM:
        nb_block = float(sys.getsizeof(X))/MAX_GPU_RAM
      nb_sample = np.shape(X)[0]
        interval_points = np.linspace(0,nb_sample,nb_block+1).astype(int)
      interval_points = np.linspace(
          0, nb_sample, np.ceil(float(nb_sample)/batch_size)+1).astype(int)
      y_preds = []
      for j in range(0,len(interval_points)-1):
        indices = range(interval_points[j],interval_points[j+1])
        y_preds.append(self.predict_on_batch(X[indices,:]))
      y_pred = np.concatenate(y_preds)
      else:
        y_pred = self.predict_on_batch(X)
      y_pred = np.reshape(y_pred, np.shape(y))

      shard_df = pd.DataFrame(columns=column_names)
+8 −1
Original line number Diff line number Diff line
@@ -136,8 +136,15 @@ class MultiTaskDNN(KerasModel):
    nb_tasks = len(sorted_tasks)
    y_pred = np.zeros((nb_samples, nb_tasks))
    for ind, task in enumerate(sorted_tasks):
      task_type = self.task_types[task]
      taskname = "task%d" % ind
      y_pred[:,ind] = np.squeeze(y_pred_dict[taskname])
      if task_type == "classification":
        # Class probabilities are predicted for classification outputs. Instead,
        # output the most likely class.
        y_pred_task = np.squeeze(np.argmax(y_pred_dict[taskname], axis=1))
      else:
        y_pred_task = np.squeeze(y_pred_dict[taskname])
      y_pred[:,ind] = y_pred_task
    y_pred = np.squeeze(y_pred)
    return y_pred

+1 −1
Original line number Diff line number Diff line
@@ -59,7 +59,7 @@ class SklearnModel(Model):
      Xs.append(X)
      ys.append(y)
    X = np.concatenate(Xs)
    y = np.concatenate(ys)
    y = np.concatenate(ys).ravel()
    self.raw_model.fit(X, y)

  def predict_on_batch(self, X):
+22 −18
Original line number Diff line number Diff line
@@ -13,6 +13,10 @@ from deepchem.utils.save import save_to_disk
from deepchem.utils.save import load_from_disk
from deepchem.utils.featurize import FeaturizedSamples

# TODO(rbharath): The semantics of this class are very difficult to debug.
# Multiple transformations of the data are performed on disk, and computations
# of mean/std are spread across multiple functions for efficiency. Some
# refactoring needs to happen here.
class Dataset(object):
  """
  Wrapper class for dataset transformed into X, y, w numpy ndarrays.
@@ -108,7 +112,6 @@ class Dataset(object):
    """
    nb_shards = self.get_number_shards()
    for i, row in self.metadata_df.iterrows():
      print("Loading shard %d out of %d" % (i+1, nb_shards))
      X = load_from_disk(row['X-transformed'])
      y = load_from_disk(row['y-transformed'])
      w = load_from_disk(row['w'])
@@ -133,7 +136,6 @@ class Dataset(object):

    # Store input_transforms/output_transforms so the dataset remembers its state.

    print("Transforming data.")
    X_means, X_stds, y_means, y_stds = self._transform(normalize_X, normalize_y,
                                                       truncate_x, truncate_y,
                                                       log_X, log_y,
@@ -193,10 +195,9 @@ def _transform_row(i, df, normalize_X, normalize_y, truncate_X, truncate_y,
  X = load_from_disk(row['X'])
  if normalize_X or log_X:
    if normalize_X:
      print("Normalizing X sample %d out of %d" % (i+1,total))
      # Turns NaNs to zeros
      X = np.nan_to_num((X - X_means) / X_stds)
      if truncate_X:
         print("Truncating X sample %d out of %d" % (i+1,total))
         X[X > trunc] = trunc
         X[X < (-1.0*trunc)] = -1.0 * trunc
    if log_X:
@@ -249,7 +250,6 @@ def compute_sums_and_nb_sample(tensor, W=None):

def write_dataset_single(val, data_dir, feature_types):
  (df_file, df) = val
  print("Examining %s" % df_file)
  # TODO(rbharath): This is a hack. clean up.
  if not len(df):
    return None
@@ -301,29 +301,33 @@ def _df_to_numpy(df, feature_types):
  y[missing] = 0.
  w[missing] = 0.

  return sorted_ids, x, y, w
  return sorted_ids, x.astype(float), y.astype(float), w.astype(float)


def compute_mean_and_std(df):
  """
  Compute means/stds of X/y from sums/sum_squares of tensors.

  Parameters
  ----------
  df: pd.DataFrame
    Metadata frame with one row per shard. Columns "X_sums" and
    "X_sum_squares" hold per-shard feature sums and sums of squares,
    "X_n" holds per-shard sample counts (floats); "y_sums",
    "y_sum_squares", and "y_n" hold the analogous per-task statistics,
    where each "y_n" entry is an array of shape (n_tasks,).

  Returns
  -------
  Tuple (X_means, X_stds, y_means, y_stds) of numpy arrays.
  """
  X_sums, X_sum_squares, X_n = (list(df['X_sums']),
                                list(df['X_sum_squares']),
                                list(df['X_n']))
  # Note that X_n is a list of floats; n is the total sample count.
  n = float(np.sum(X_n))
  X_sums = np.vstack(X_sums)
  X_sum_squares = np.vstack(X_sum_squares)
  overall_X_sums = np.sum(X_sums, axis=0)
  overall_X_means = overall_X_sums / n
  overall_X_sum_squares = np.sum(X_sum_squares, axis=0)

  # Var[X] = E[X^2] - E[X]^2, computed from the aggregated sums.
  X_vars = (overall_X_sum_squares - np.square(overall_X_sums)/n)/(n)

  y_sums, y_sum_squares, y_n = (list(df['y_sums']),
                                list(df['y_sum_squares']),
                                list(df['y_n']))
  # Note y_n is a list of arrays of shape (n_tasks,); summing across
  # shards gives the per-task sample counts, so the divisions below are
  # elementwise per task (tasks may have differing numbers of labels).
  y_n = np.sum(y_n, axis=0)
  y_sums = np.vstack(y_sums)
  y_sum_squares = np.vstack(y_sum_squares)
  y_means = np.sum(y_sums, axis=0)/y_n
  y_vars = np.sum(y_sum_squares, axis=0)/y_n - np.square(y_means)
  return overall_X_means, np.sqrt(X_vars), y_means, np.sqrt(y_vars)
+12 −10
Original line number Diff line number Diff line
@@ -84,31 +84,33 @@ class Evaluator(object):
    pred_y_df.to_csv(csv_out)

    if self.task_type == "classification":
      colnames = ["task_name", "roc_auc_score", "matthews_corrcoef", "recall_score", "accuracy_score"]
      colnames = ["task_name", "roc_auc_score", "matthews_corrcoef",
                  "recall_score", "accuracy_score"]
    elif self.task_type == "regression":
      colnames = ["task_name", "r2_score", "rms_error"]
    else:
      raise ValueError("Unrecognized task type: %s" % self.task_type)

    performance_df = pd.DataFrame(columns=colnames)
    print("compute_model_performance()")
    y_means = pred_y_df.iterrows().next()[1]["y_means"]
    y_stds = pred_y_df.iterrows().next()[1]["y_stds"]

    for i, task_name in enumerate(self.task_names):
      y = pred_y_df[task_name]
      y_pred = pred_y_df["%s_pred" % task_name]
      w = pred_y_df["%s_weight" % task_name]
      
      y = pred_y_df[task_name].values
      y_pred = pred_y_df["%s_pred" % task_name].values
      w = pred_y_df["%s_weight" % task_name].values
      y = undo_transform(y, y_means, y_stds, self.output_transforms)
      y_pred = undo_transform(y_pred, y_means, y_stds, self.output_transforms)

      if self.task_type == "classification":
        y, y_pred = y[w.nonzero()], y_pred[w.nonzero()][:, 1]
        y, y_pred = y[w.nonzero()].astype(int), y_pred[w.nonzero()].astype(int)
        # Sometimes all samples have zero weight. In this case, continue.
        if not len(y):
          continue
        auc = compute_roc_auc_scores(y, y_pred, w)
        mcc = matthews_corrcoef(y, np.around(y_pred))
        recall = recall_score(y, np.around(y_pred))
        accuracy = accuracy_score(y, np.around(y_pred))
        mcc = matthews_corrcoef(y, y_pred)
        recall = recall_score(y, y_pred)
        accuracy = accuracy_score(y, y_pred)
        performance_df.loc[i] = [task_name, auc, mcc, recall, accuracy]

      elif self.task_type == "regression":
Loading