Commit cad42325 authored by Yutong Zhao
Browse files

Unlabelled dataset support WIP

Additional improvements to the support for unlabelled datasets.
parent 73d71f26
Loading
Loading
Loading
Loading
+17 −7
Original line number Diff line number Diff line
@@ -20,7 +20,7 @@ from deepchem.utils.save import load_sdf_files
from deepchem.feat import UserDefinedFeaturizer
from deepchem.data import DiskDataset

def convert_df_to_numpy(df, tasks, id_field, verbose=False):
def convert_df_to_numpy(df, tasks, verbose=False):
  """Transforms a dataframe containing deepchem input into numpy arrays"""
  n_samples = df.shape[0]
  n_tasks = len(tasks)
@@ -39,7 +39,7 @@ def convert_df_to_numpy(df, tasks, id_field, verbose=False):
      if y[ind, task] == "":
        missing[ind, task] = 1

  ids = df[id_field].values
  # ids = df[id_field].values
  # Set missing data to have weight zero
  for ind in range(n_samples):
    for task in range(n_tasks):
@@ -47,7 +47,7 @@ def convert_df_to_numpy(df, tasks, id_field, verbose=False):
        y[ind, task] = 0.
        w[ind, task] = 0.

  return ids, y.astype(float), w.astype(float)
  return y.astype(float), w.astype(float)

def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
  """Featurize individual compounds in dataframe.
@@ -152,10 +152,20 @@ class DataLoader(object):
      for shard_num, shard in enumerate(self.get_shards(input_files, shard_size)):
        time1 = time.time()
        X, valid_inds = self.featurize_shard(shard)
        ids, y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)  
        ids = shard[self.id_field].values
        ids = ids[valid_inds]
        if len(self.tasks) > 0:
          # Featurize task results iff they exist.
          y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)  
          # Filter out examples where featurization failed.
        ids, y, w = (ids[valid_inds], y[valid_inds], w[valid_inds])
          y, w = (y[valid_inds], w[valid_inds])
          assert len(X) == len(ids) == len(y) == len(w)
        else:
          # For prospective data where results are unknown, it makes
          # no sense to have y values or weights.
          y, w = (None, None)
          assert len(X) == len(ids)

        time2 = time.time()
        log("TIMING: featurizing shard %d took %0.3f s" % (shard_num, time2-time1),
            self.verbose)
+75 −27
Original line number Diff line number Diff line
@@ -410,27 +410,42 @@ class DiskDataset(Dataset):
    metadata_entries should have elements returned by write_data_to_disk
    above.
    """
    if len(metadata_entries) == 0:
      raise Exception("No metadata entries.")
    columns=('basename','task_names', 'ids', 'X', 'y', 'w')
    metadata_df = pd.DataFrame(
        metadata_entries,
        columns=('basename','task_names', 'ids', 'X', 'y', 'w'))
        columns=columns)
    return metadata_df

  @staticmethod
  def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None,
                         ids=None):
    out_X = "%s-X.joblib" % basename
    out_y = "%s-y.joblib" % basename
    out_w = "%s-w.joblib" % basename
    out_ids = "%s-ids.joblib" % basename

    if X is not None:
      out_X = "%s-X.joblib" % basename
      save_to_disk(X, os.path.join(data_dir, out_X))
    else:
      out_X = None

    if y is not None:
      out_y = "%s-y.joblib" % basename
      save_to_disk(y, os.path.join(data_dir, out_y))
    else:
      out_y = None

    if w is not None:
      out_w = "%s-w.joblib" % basename
      save_to_disk(w, os.path.join(data_dir, out_w))
    else:
      out_w = None

    if ids is not None:
      out_ids = "%s-ids.joblib" % basename
      save_to_disk(ids, os.path.join(data_dir, out_ids))
    else:
      out_ids = None

    # note that this corresponds to the _construct_metadata column order
    return [basename, tasks, out_ids, out_X, out_y, out_w]

  def save_to_disk(self):
@@ -526,15 +541,22 @@ class DiskDataset(Dataset):
      for _, row in dataset.metadata_df.iterrows():
        X = np.array(load_from_disk(
            os.path.join(dataset.data_dir, row['X'])))
        ids = np.array(load_from_disk(
            os.path.join(dataset.data_dir, row['ids'])), dtype=object)
        # These columns may be missing if the dataset is unlabelled.
        if row['y'] is not None:
          y = np.array(load_from_disk(
            os.path.join(dataset.data_dir, row['y'])))
        else:
          y = None
        if row['w'] is not None:
          w_filename = os.path.join(dataset.data_dir, row['w'])
          if os.path.exists(w_filename):
              w = np.array(load_from_disk(w_filename))
          else:
              w = np.ones(y.shape)
        ids = np.array(load_from_disk(
            os.path.join(dataset.data_dir, row['ids'])), dtype=object)
        else:
          w = None
        yield (X, y, w, ids)
    return iterate(self)

@@ -571,8 +593,17 @@ class DiskDataset(Dataset):
          indices = range(interval_points[j], interval_points[j+1])
          perm_indices = sample_perm[indices]
          X_batch = X[perm_indices]

          if y is not None:
            y_batch = y[perm_indices]
          else:
            y_batch = None

          if w is not None:
            w_batch = w[perm_indices]
          else:
            w_batch = None

          ids_batch = ids[perm_indices]
          if pad_batches:
            (X_batch, y_batch, w_batch, ids_batch) = pad_batch(
@@ -750,13 +781,23 @@ class DiskDataset(Dataset):
    row = self.metadata_df.iloc[i]
    X = np.array(load_from_disk(
        os.path.join(self.data_dir, row['X'])))

    if row['y'] is not None:
      y = np.array(load_from_disk(
        os.path.join(self.data_dir, row['y'])))
    else:
      y = None

    if row['w'] is not None:
      # TODO (ytz): Under what condition does this exist but the file itself doesn't?
      w_filename = os.path.join(self.data_dir, row['w'])
      if os.path.exists(w_filename):
          w = np.array(load_from_disk(w_filename))
      else:
          w = np.ones(y.shape)
    else:
      w = None

    ids = np.array(load_from_disk(
        os.path.join(self.data_dir, row['ids'])), dtype=object)
    return (X, y, w, ids)
@@ -871,7 +912,7 @@ class DiskDataset(Dataset):
    """
    total = 0
    for _, row in self.metadata_df.iterrows():
      y = load_from_disk(os.path.join(self.data_dir, row['y']))
      y = load_from_disk(os.path.join(self.data_dir, row['ids']))
      total += len(y)
    return total

@@ -879,17 +920,24 @@ class DiskDataset(Dataset):
    """Finds shape of dataset."""
    n_tasks = len(self.get_task_names())
    X_shape = np.array((0,) + (0,) * len(self.get_data_shape())) 
    ids_shape = np.array((0,))
    if n_tasks > 0:
      y_shape = np.array((0,) + (0,))
      w_shape = np.array((0,) + (0,))
    ids_shape = np.array((0,))
    else:
      y_shape = tuple()
      w_shape = tuple()

    for shard_num, (X, y, w, ids) in enumerate(self.itershards()):
      if shard_num == 0:
        X_shape += np.array(X.shape)
        if n_tasks > 0:
          y_shape += np.array(y.shape)
          w_shape += np.array(w.shape)
        ids_shape += np.array(ids.shape)
      else:
        X_shape[0] += np.array(X.shape)[0]
        if n_tasks > 0:
          y_shape[0] += np.array(y.shape)[0]
          w_shape[0] += np.array(w.shape)[0]
        ids_shape[0] += np.array(ids.shape)[0]