Commit a9c75773 authored by Bharath Ramsundar

Debugged issue in df_to_numpy

parent 2d1e586f
+14 −14
@@ -51,7 +51,7 @@ class Dataset(object):
                   'y_sums', 'y_sum_squares', 'y_n')) 
      save_to_disk(
        self.metadata_df, self._get_metadata_filename())
      # input/output transforms not specified, so
      # input/output transforms not specified yet, so
      # self.transforms = (input_transforms, output_transforms) =>
      self.transforms = ([], [])
      save_to_disk(
@@ -99,6 +99,9 @@ class Dataset(object):
    """
    return self.metadata_df.shape[0]

  # TODO(rbharath): There is a dangerous mixup in semantics. If itershards() is
  # called without calling transform(), it will explode. Maybe have a separate
  # initialization function to avoid this problem.
  def itershards(self):
    """
    Iterates over all shards in dataset.
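One way to realize the "separate initialization function" idea from the TODO above is a fail-fast guard at the top of itershards(). A minimal sketch, assuming a hypothetical _transformed flag set by transform() (none of these names are in the repository):

class Dataset(object):  # condensed sketch, not the class above
  def __init__(self):
    self._transformed = False  # hypothetical readiness flag

  def transform(self, input_transforms, output_transforms):
    # ... apply transforms and write transformed shards to disk ...
    self._transformed = True

  def itershards(self):
    """Iterates over all shards in dataset."""
    # Raise a clear error up front instead of exploding
    # mid-iteration when transform() was never called.
    if not self._transformed:
      raise ValueError("Call transform() before itershards().")
    return iter([])  # shard iteration elided in this sketch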
@@ -251,7 +254,7 @@ def write_dataset_single(val, data_dir, feature_types):
  if not len(df):
    return None
  task_names = FeaturizedSamples.get_sorted_task_names(df)
  ids, X, y, w = df_to_numpy(df, feature_types)
  ids, X, y, w = _df_to_numpy(df, feature_types)
  X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
  y_sums, y_sum_squares, y_n = compute_sums_and_nb_sample(y, w)

@@ -267,18 +270,21 @@ def write_dataset_single(val, data_dir, feature_types):
  save_to_disk(y, out_y)
  save_to_disk(w, out_w)
  save_to_disk(ids, out_ids)
  # TODO(rbharath): Should X be saved to out_X_transformed as well? Since
  # itershards expects to loop over X-transformed? (Ditto for y/w)
  return([df_file, task_names, out_ids, out_X, out_X_transformed, out_y, 
          out_y_transformed, out_w,
          X_sums, X_sum_squares, X_n, 
          y_sums, y_sum_squares, y_n])

def df_to_numpy(df, feature_types):
def _df_to_numpy(df, feature_types):
  """Transforms a featurized dataset df into standard set of numpy arrays"""
  # perform common train/test split across all tasks
  n_samples = df.shape[0]
  sorted_tasks = FeaturizedSamples.get_sorted_task_names(df)
  n_tasks = len(sorted_tasks)
  y = df[sorted_tasks].values
  y = np.reshape(y, (n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  tensors = []
  for i, datapoint in df.iterrows():
@@ -288,18 +294,12 @@ def df_to_numpy(df, feature_types):
    features = np.squeeze(np.concatenate(feature_list))
    tensors.append(features)
  x = np.stack(tensors)
  sorted_ids = df["mol_id"]

  #TODO(enf/rbharath): This is not compatible with multitask use case.
  nonzero_labels = np.arange(len(y))[[val != '' for val in y]]
  #nonzero_labels = np.squeeze(np.where(y!=''))
  x = x[nonzero_labels]
  y = y[nonzero_labels]
  w = w[nonzero_labels]
  nonzero_rows = []
  for nonzero_ind in nonzero_labels:
    nonzero_rows.append(df.iloc[nonzero_ind])
  nonzero_df = pd.DataFrame(nonzero_rows)
  sorted_ids = nonzero_df["mol_id"]
  # Set missing data to have weight zero
  missing = (y == "")
  y[missing] = 0.
  w[missing] = 0.

  return sorted_ids, x, y, w
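For a 2-d y, np.where(y != '') returns separate row and column index arrays, so the squeezed form kept commented out above does not yield a clean row index; the new list-comprehension mask builds one explicitly. A single-task sketch of the patched behavior (illustrative data only, not the repository's code):

import numpy as np

# '' marks a missing label, as in _df_to_numpy.
y = np.array([["1.0"], [""], ["0.0"]], dtype=object)
x = np.arange(6).reshape(3, 2)
w = np.ones((3, 1))

# Row indices of samples with a nonempty label (flattened here,
# since this sketch is single-task).
nonzero_labels = np.arange(len(y))[[val != "" for val in np.ravel(y)]]
x, y, w = x[nonzero_labels], y[nonzero_labels], w[nonzero_labels]

# Remaining missing entries get weight zero, as in the code above.
missing = (y == "")
y[missing] = 0.
w[missing] = 0.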

+1 −0
@@ -91,6 +91,7 @@ class Evaluator(object):
      raise ValueError("Unrecognized task type: %s" % self.task_type)

    performance_df = pd.DataFrame(columns=colnames)
    print("compute_model_performance()")
    y_means = pred_y_df.iterrows().next()[1]["y_means"]
    y_stds = pred_y_df.iterrows().next()[1]["y_stds"]
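The .iterrows().next() calls above grab the first row of pred_y_df; that spelling only works on Python 2. A portable equivalent, shown with a toy frame (a sketch, not part of the commit):

import pandas as pd

pred_y_df = pd.DataFrame({"y_means": [0.5], "y_stds": [1.2]})
# next(...) works on Python 2 and 3; .iterrows().next() is 2-only.
first_row = next(pred_y_df.iterrows())[1]
y_means = first_row["y_means"]
y_stds = first_row["y_stds"]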

+67 −1
@@ -57,7 +57,73 @@ class TestSingletaskVectorAPI(unittest.TestCase):
                                smiles_field=self.smiles_field,
                                verbose=True)
    feature_file = os.path.join(self.feature_dir, "out.joblib")
    featurizer.featurize(self.input_file, ["ECFP"], feature_file)
    featurizer.featurize(self.input_file, feature_types, feature_file)

    # Transform data into arrays for ML
    samples = FeaturizedSamples(self.samplesdir, [feature_file], reload=False)

    # Split into train/test
    train_samples, test_samples = samples.train_test_split(splittype,
      self.train_dir, self.test_dir)
    train_dataset = Dataset(self.train_dir, train_samples, feature_types)
    test_dataset = Dataset(self.test_dir, test_samples, feature_types)

    # Transforming train/test data
    train_dataset.transform(input_transforms, output_transforms)
    test_dataset.transform(input_transforms, output_transforms)

    # Fit model
    task_types = {task: task_type for task in self.tasks}
    model = Model.model_builder(model_name, task_types, model_params)
    model.fit(train_dataset)
    model.save(self.model_dir)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, verbose=True)
    with tempfile.NamedTemporaryFile() as test_csv_out:
      with tempfile.NamedTemporaryFile() as test_stats_out:
        evaluator.compute_model_performance(test_csv_out, test_stats_out)

class TestMultitaskVectorAPI(unittest.TestCase):
  """
  Test top-level API for multitask vector models.
  """
  def setUp(self):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    self.input_file = os.path.join(current_dir, "multitask_example.csv")
    self.tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
                  "task7", "task8", "task9", "task10", "task11", "task12",
                  "task13", "task14", "task15", "task16"]
    self.smiles_field="smiles"
    self.feature_dir = tempfile.mkdtemp()
    self.samplesdir = tempfile.mkdtemp()
    self.train_dir = tempfile.mkdtemp()
    self.test_dir = tempfile.mkdtemp()
    self.model_dir = tempfile.mkdtemp()

  def tearDown(self):
    shutil.rmtree(self.feature_dir)
    shutil.rmtree(self.samplesdir)
    shutil.rmtree(self.train_dir)
    shutil.rmtree(self.test_dir)
    shutil.rmtree(self.model_dir)

  def test_API(self):
    """Straightforward test of deepchem API."""
    splittype = "random"
    feature_types = ["ECFP"]
    output_transforms = ["normalize"]
    input_transforms = []
    task_type = "classification"
    model_params = {}
    model_name = "multitask_deep_classifier"

    # Featurize input
    featurizer = DataFeaturizer(tasks=self.tasks,
                                smiles_field=self.smiles_field,
                                verbose=True)
    feature_file = os.path.join(self.feature_dir, "out.joblib")
    featurizer.featurize(self.input_file, feature_types, feature_file)

    # Transform data into arrays for ML
    samples = FeaturizedSamples(self.samplesdir, [feature_file], reload=False)
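
Condensed, the end-to-end sequence both test_API methods exercise looks like the sketch below. It only restates calls that appear in the diff above; variable values are illustrative and imports follow the test module's.

# featurize -> samples -> split -> datasets -> transform -> fit -> eval
featurizer = DataFeaturizer(tasks=tasks, smiles_field="smiles", verbose=True)
featurizer.featurize(input_file, ["ECFP"], feature_file)
samples = FeaturizedSamples(samplesdir, [feature_file], reload=False)
train_samples, test_samples = samples.train_test_split(
    "random", train_dir, test_dir)
train_dataset = Dataset(train_dir, train_samples, ["ECFP"])
test_dataset = Dataset(test_dir, test_samples, ["ECFP"])
train_dataset.transform([], ["normalize"])
test_dataset.transform([], ["normalize"])
task_types = {task: "classification" for task in tasks}
model = Model.model_builder("multitask_deep_classifier", task_types, {})
model.fit(train_dataset)
model.save(model_dir)
evaluator = Evaluator(model, test_dataset, verbose=True)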