Commit a3501081 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #380 from proteneer/unlabelled

Unlabelled dataset support WIP
parents f53da6a5 15b5199d
Loading
Loading
Loading
Loading
+17 −7
Original line number Diff line number Diff line
@@ -20,7 +20,7 @@ from deepchem.utils.save import load_sdf_files
from deepchem.feat import UserDefinedFeaturizer
from deepchem.data import DiskDataset

def convert_df_to_numpy(df, tasks, id_field, verbose=False):
def convert_df_to_numpy(df, tasks, verbose=False):
  """Transforms a dataframe containing deepchem input into numpy arrays"""
  n_samples = df.shape[0]
  n_tasks = len(tasks)
@@ -39,7 +39,7 @@ def convert_df_to_numpy(df, tasks, id_field, verbose=False):
      if y[ind, task] == "":
        missing[ind, task] = 1

  ids = df[id_field].values
  # ids = df[id_field].values
  # Set missing data to have weight zero
  for ind in range(n_samples):
    for task in range(n_tasks):
@@ -47,7 +47,7 @@ def convert_df_to_numpy(df, tasks, id_field, verbose=False):
        y[ind, task] = 0.
        w[ind, task] = 0.

  return ids, y.astype(float), w.astype(float)
  return y.astype(float), w.astype(float)

def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
  """Featurize individual compounds in dataframe.
@@ -152,10 +152,20 @@ class DataLoader(object):
      for shard_num, shard in enumerate(self.get_shards(input_files, shard_size)):
        time1 = time.time()
        X, valid_inds = self.featurize_shard(shard)
        ids, y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)  
        ids = shard[self.id_field].values
        ids = ids[valid_inds]
        if len(self.tasks) > 0:
          # Featurize task results iff they exist.
          y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)  
          # Filter out examples where featurization failed.
        ids, y, w = (ids[valid_inds], y[valid_inds], w[valid_inds])
          y, w = (y[valid_inds], w[valid_inds])
          assert len(X) == len(ids) == len(y) == len(w)
        else:
          # For prospective data where results are unknown, it makes
          # no sense to have y values or weights.
          y, w = (None, None)
          assert len(X) == len(ids)

        time2 = time.time()
        log("TIMING: featurizing shard %d took %0.3f s" % (shard_num, time2-time1),
            self.verbose)
+79 −28
Original line number Diff line number Diff line
@@ -410,27 +410,40 @@ class DiskDataset(Dataset):
    metadata_entries should have elements returned by write_data_to_disk
    above.
    """
    columns=('basename','task_names', 'ids', 'X', 'y', 'w')
    metadata_df = pd.DataFrame(
        metadata_entries,
        columns=('basename','task_names', 'ids', 'X', 'y', 'w'))
        columns=columns)
    return metadata_df

  @staticmethod
  def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None,
                         ids=None):
    """Persist one shard's arrays under ``data_dir`` as joblib files.

    Each of X, y, w, ids is optional: a present array is saved to
    ``<basename>-<name>.joblib`` and its filename recorded; a missing array
    is recorded as None so unlabelled datasets can omit y/w entirely.

    Returns
    -------
    list
      ``[basename, tasks, out_ids, out_X, out_y, out_w]`` — matching the
      _construct_metadata column order ('basename', 'task_names', 'ids',
      'X', 'y', 'w').
    """
    # NOTE(review): previously the out_* filenames were computed
    # unconditionally before these branches; those assignments were dead
    # code (always shadowed below) and have been removed.
    if X is not None:
      out_X = "%s-X.joblib" % basename
      save_to_disk(X, os.path.join(data_dir, out_X))
    else:
      out_X = None

    if y is not None:
      out_y = "%s-y.joblib" % basename
      save_to_disk(y, os.path.join(data_dir, out_y))
    else:
      out_y = None

    if w is not None:
      out_w = "%s-w.joblib" % basename
      save_to_disk(w, os.path.join(data_dir, out_w))
    else:
      out_w = None

    if ids is not None:
      out_ids = "%s-ids.joblib" % basename
      save_to_disk(ids, os.path.join(data_dir, out_ids))
    else:
      out_ids = None

    # note that this corresponds to the _construct_metadata column order
    return [basename, tasks, out_ids, out_X, out_y, out_w]

  def save_to_disk(self):
@@ -526,15 +539,22 @@ class DiskDataset(Dataset):
      for _, row in dataset.metadata_df.iterrows():
        X = np.array(load_from_disk(
            os.path.join(dataset.data_dir, row['X'])))
        ids = np.array(load_from_disk(
            os.path.join(dataset.data_dir, row['ids'])), dtype=object)
        # These columns may be missing if the dataset is unlabelled.
        if row['y'] is not None:
          y = np.array(load_from_disk(
            os.path.join(dataset.data_dir, row['y'])))
        else:
          y = None
        if row['w'] is not None:
          w_filename = os.path.join(dataset.data_dir, row['w'])
          if os.path.exists(w_filename):
              w = np.array(load_from_disk(w_filename))
          else:
              w = np.ones(y.shape)
        ids = np.array(load_from_disk(
            os.path.join(dataset.data_dir, row['ids'])), dtype=object)
        else:
          w = None
        yield (X, y, w, ids)
    return iterate(self)

@@ -571,8 +591,17 @@ class DiskDataset(Dataset):
          indices = range(interval_points[j], interval_points[j+1])
          perm_indices = sample_perm[indices]
          X_batch = X[perm_indices]

          if y is not None:
            y_batch = y[perm_indices]
          else:
            y_batch = None

          if w is not None:
            w_batch = w[perm_indices]
          else:
            w_batch = None

          ids_batch = ids[perm_indices]
          if pad_batches:
            (X_batch, y_batch, w_batch, ids_batch) = pad_batch(
@@ -592,7 +621,12 @@ class DiskDataset(Dataset):
        for (X_shard, y_shard, w_shard, ids_shard) in dataset.itershards():
            n_samples = X_shard.shape[0]
            for i in range(n_samples):
                yield (X_shard[i], y_shard[i], w_shard[i], ids_shard[i])
                def sanitize(elem):
                  if elem is None:
                    return None
                  else:
                    return elem[i]
                yield map(sanitize, [X_shard, y_shard, w_shard, ids_shard])
    return iterate(self)

  def transform(self, fn, **args):
@@ -750,13 +784,23 @@ class DiskDataset(Dataset):
    row = self.metadata_df.iloc[i]
    X = np.array(load_from_disk(
        os.path.join(self.data_dir, row['X'])))

    if row['y'] is not None:
      y = np.array(load_from_disk(
        os.path.join(self.data_dir, row['y'])))
    else:
      y = None

    if row['w'] is not None:
      # TODO (ytz): Under what condition does this exist but the file itself doesn't?
      w_filename = os.path.join(self.data_dir, row['w'])
      if os.path.exists(w_filename):
          w = np.array(load_from_disk(w_filename))
      else:
          w = np.ones(y.shape)
    else:
      w = None

    ids = np.array(load_from_disk(
        os.path.join(self.data_dir, row['ids'])), dtype=object)
    return (X, y, w, ids)
@@ -871,7 +915,7 @@ class DiskDataset(Dataset):
    """
    total = 0
    for _, row in self.metadata_df.iterrows():
      y = load_from_disk(os.path.join(self.data_dir, row['y']))
      y = load_from_disk(os.path.join(self.data_dir, row['ids']))
      total += len(y)
    return total

@@ -879,17 +923,24 @@ class DiskDataset(Dataset):
    """Finds shape of dataset."""
    n_tasks = len(self.get_task_names())
    X_shape = np.array((0,) + (0,) * len(self.get_data_shape())) 
    ids_shape = np.array((0,))
    if n_tasks > 0:
      y_shape = np.array((0,) + (0,))
      w_shape = np.array((0,) + (0,))
    ids_shape = np.array((0,))
    else:
      y_shape = tuple()
      w_shape = tuple()

    for shard_num, (X, y, w, ids) in enumerate(self.itershards()):
      if shard_num == 0:
        X_shape += np.array(X.shape)
        if n_tasks > 0:
          y_shape += np.array(y.shape)
          w_shape += np.array(w.shape)
        ids_shape += np.array(ids.shape)
      else:
        X_shape[0] += np.array(X.shape)[0]
        if n_tasks > 0:
          y_shape[0] += np.array(y.shape)[0]
          w_shape[0] += np.array(w.shape)[0]
        ids_shape[0] += np.array(ids.shape)[0]
+9 −0
Original line number Diff line number Diff line
@@ -91,3 +91,12 @@ def load_gaussian_cdf_data():
  loader = dc.data.UserCSVLoader(
      tasks=tasks, featurizer=featurizer, id_field="id")
  return loader.featurize(input_file)

def load_unlabelled_data():
  """Featurize the no-labels test CSV (smiles only, no task columns).

  Returns the dataset produced by CSVLoader.featurize with circular
  fingerprints of size 1024 and an empty task list.
  """
  base_dir = os.path.dirname(os.path.abspath(__file__))
  csv_path = os.path.join(base_dir, "../../data/tests/no_labels.csv")
  circular_fp = dc.feat.CircularFingerprint(size=1024)
  loader = dc.data.CSVLoader(
      tasks=[], smiles_field="smiles", featurizer=circular_fp)
  return loader.featurize(csv_path)
 No newline at end of file
+26 −0
Original line number Diff line number Diff line
smiles,id
O=C1CCc2c(N1)[c-]c([c-][c-]2)OCCCC[N+]1([O-])CCN(CC1)c1[c-][c-][c-]c(c1Cl)Cl,48866084_50429806
O=C1CCc2c(N1)[c-]c([c-][c-]2)OCCCCN1CC[N+](CC1)([O-])c1[c-][c-][c-]c(c1Cl)Cl,48866086_50429808
CO[C@H]1O[C@H]2O[C@]3(C)CC[C@H]4[C@@]2([C@@H]([C@H]1C)CC[C@@H]4C)OO3,48866088_48866087
O=C1O[C@@H]2O[C@]3(C)CC[C@H]4[C@@]2([C@H]([C@@H]1C)CC[C@@H]4C)OO3,48866090_48866089
O=C1O[C@@H]2O[C@]3(C)CC[C@H]4[C@@]2([C@H](C1=C)CC[C@@H]4C)OO3,48866092_48866091
OCC1O[C@@H](O[C@@H]2C[C@@H](C(=O)O)[C@@H]3[C@](C2)(C)[C@@H]2CC[C@@H]4C[C@@]2(CC3)[C@@H](O)C4=C)C(C([C@@H]1OS(=O)(=O)[O-])OS(=O)(=O)[O-])OC(=O)CC(C)C.[Na+].[Na+],48866104_48866103
OC1C[C@@H](O[C@@H]1COP(=O)(O)O)n1cnc(nc1=O)N,48866106_48866105
C/C=C(/C(=O)OC1C[C@H](OC(=O)C)C2([C@@H]3[C@@]41CO[C@@]([C@H]4[C@@](C)([C@H]([C@H]3OC2)O)[C@@]12OC2(C)C2CC1O[C@@H]1C2(O)C=CO1)(O)C(=O)OC)C(=O)OC)\C,48866108_48866107
CN1CCC(=C2c3[c-][c-][c-][c-]c3CCc3c2n[c-][c-][c-]3)CC1.OC(=O)/C=C\C(=O)O,48866111_33542275
Clc1[c-][c-]c([c-][c-]1)Cc1nn(C2CCC[N+](CC2)([O-])C)c(=O)c2c1[c-][c-][c-][c-]2,48866115_48866114
CC[C@@H]1OC(=O)[C@H](C)[C@H](OC2OC(C)C(C(C2)(C)OC)O)[C@@H](C)[C@H](OC2OC(C)CC(C2O)[N+](C)(C)[O-])[C@](C[C@@H](CN([C@@H]([C@H](C1(C)O)O)C)C)C)(C)O,48866130_48866129
CO/C=C(\c1[c-][c-][c-][c-]c1Oc1n[c-]nc([c-]1)Oc1[c-][c-][c-][c-]c1C#N)/C(=O)OC,48866134_207297540
COC(=O)C1=C(C)NC(=C([C@@H]1c1cccc(c1)[N+](=O)[O-])C(=O)O[C@H]1CCN(C1)Cc1ccccc1)C.Cl,48866140_48866139
O=S1(=O)N[C@H](Cc2[c-][c-][c-][c-][c-]2)Nc2c1[c-]c(c([c-]2)C(F)(F)F)S(=O)(=O)N,48866148_48866147
O=S1(=O)N[C@@H](Cc2[c-][c-][c-][c-][c-]2)Nc2c1[c-]c(c([c-]2)C(F)(F)F)S(=O)(=O)N,48866150_48866149
[c-]1[c-][c-]c([c-][c-]1)/C=N/N=C/c1[c-][c-][c-][c-][c-]1,48866152_48866151
O=C(c1[c-][c-][c-][c-][c-]1)NOCC(=O)O,48866154_48866153
CC(CC(c1[c-][c-]c([c-][c-]1)OCCOCC[N+](Cc1[c-][c-][c-][c-][c-]1)(C)C)(C)C)(C)C.[Cl-],48866156_515814
O=C1CN(C1)C(c1[c-][c-][c-][c-][c-]1)c1[c-][c-][c-][c-][c-]1,48866158_48866157
OC(=O)c1[c-][c-]c2c([c-]1)n[c-]n2,48866160_48866159
Cc1c(OCC(F)(F)F)[c-][c-]n2c1c(Sc1nc3c(n1)[c-][c-][c-][c-]3)n1c2nc2c1[c-][c-][c-][c-]2,48866162_48866161
CCc1oc2c(c1C(=O)c1[c-]c(I)c(c([c-]1)I)O)[c-][c-][c-][c-]2,48866164_48866163
[c-]1[c-]c2[c-]c3c4[c-][c-][c-][c-]c4[c-][c-]c3c3c2c([c-]1)[C-]=[C-]3.[c-]1[c-][c-]c2c([c-]1)[c-]c1c3c2[C-]=[C-]c3[c-]c2c1[c-][c-][c-][c-]2,48866166_48866165
O=C1CC(=O)Nc2c(N1)[c-][c-][c-][c-]2,48866168_48866167
ClCC(=O)N1[C@@H](Cc2c([C@H]1c1[c-][c-]c3c([c-]1)OCO3)nc1c2[c-][c-][c-][c-]1)C(=O)OC,48866170_207350992
+9 −0
Original line number Diff line number Diff line
@@ -23,6 +23,15 @@ class TestDataLoader(unittest.TestCase):
    super(TestDataLoader, self).setUp()
    self.current_dir = os.path.dirname(os.path.abspath(__file__))

  def unlabelled_test(self):
    """Check that a CSV with no task columns can be featurized."""
    csv_path = os.path.join(
        self.current_dir, "../../data/tests/no_labels.csv")
    circular_fp = dc.feat.CircularFingerprint(size=1024)
    csv_loader = dc.data.CSVLoader(
        tasks=[], smiles_field="smiles",
        featurizer=circular_fp)
    csv_loader.featurize(csv_path)

  def scaffold_test_train_valid_test_split(self):
    """Test of singletask RF ECFP regression API."""
    splittype = "scaffold"
Loading