Commit 7f1bef0c authored by Bharath Ramsundar
Browse files

Fixed test_load

parent 97b22c5d
Loading
Loading
Loading
Loading
+32 −79
Original line number Diff line number Diff line
@@ -30,10 +30,6 @@ class TestLoad(TestAPI):
    """Test that datasets can be moved and loaded."""
    verbosity = "high"
    current_dir = os.path.dirname(os.path.realpath(__file__))
    feature_dir = os.path.join(self.base_dir, "features")
    moved_feature_dir = os.path.join(self.base_dir, "moved_features")
    samples_dir = os.path.join(self.base_dir, "samples")
    moved_samples_dir = os.path.join(self.base_dir, "moved_samples")
    data_dir = os.path.join(self.base_dir, "data")
    moved_data_dir = os.path.join(self.base_dir, "moved_data")
    dataset_file = os.path.join(
@@ -43,24 +39,16 @@ class TestLoad(TestAPI):
    tasks = ["log-solubility"]
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field="smiles",
                                compound_featurizers=featurizers,
                                featurizers=featurizers,
                                verbosity=verbosity)
    featurized_samples = featurizer.featurize(
        dataset_file, feature_dir,
        samples_dir, reload=reload)
    dataset = Dataset(data_dir=data_dir, samples=featurized_samples, 
                      featurizers=featurizers, tasks=tasks,
                      verbosity=verbosity, reload=reload)
    dataset = featurizer.featurize(
        dataset_file, data_dir)

    X, y, w, ids = dataset.to_numpy()
    shutil.move(feature_dir, moved_feature_dir)
    shutil.move(samples_dir, moved_samples_dir)
    shutil.move(data_dir, moved_data_dir)

    moved_dataset = Dataset(
        data_dir=moved_data_dir, samples=featurized_samples, 
        featurizers=featurizers, tasks=tasks,
        verbosity=verbosity, reload=reload)
        moved_data_dir, reload=reload)

    X_moved, y_moved, w_moved, ids_moved = moved_dataset.to_numpy()

@@ -69,10 +57,12 @@ class TestLoad(TestAPI):
    np.testing.assert_allclose(w, w_moved)
    np.testing.assert_array_equal(ids, ids_moved)

    

  def test_multiload(self):
    """Check can re-use featurization for multiple task selections."""
    """Check can re-use featurization for multiple task selections.

    TODO(rbharath): This test seems silly after the recent round of
                    refactoring. Can it be removed?
    """
    # Only for debug!
    np.random.seed(123)

@@ -83,9 +73,7 @@ class TestLoad(TestAPI):

    current_dir = os.path.dirname(os.path.realpath(__file__))
    #Make directories to store the raw and featurized datasets.
    feature_dir = os.path.join(self.base_dir, "features")
    samples_dir = os.path.join(self.base_dir, "samples")
    full_dir = os.path.join(self.base_dir, "full_dataset")
    data_dir = os.path.join(self.base_dir, "dataset")
    train_dir = os.path.join(self.base_dir, "train_dataset")
    valid_dir = os.path.join(self.base_dir, "valid_dataset")
    test_dir = os.path.join(self.base_dir, "test_dataset")
@@ -107,36 +95,24 @@ class TestLoad(TestAPI):
    ####### Do featurization
    featurizer = DataFeaturizer(tasks=all_tasks,
                                smiles_field="smiles",
                                compound_featurizers=featurizers,
                                featurizers=featurizers,
                                verbosity=verbosity)
    featurized_samples = featurizer.featurize(
        dataset_file, feature_dir,
        samples_dir, shard_size=8192,
        reload=reload)

    full_dataset = Dataset(data_dir=full_dir, samples=featurized_samples, 
                            featurizers=featurizers, tasks=all_tasks,
                            verbosity=verbosity, reload=reload)
    dataset = featurizer.featurize(
        dataset_file, data_dir)

    # Do train/valid split.
    X_multi, y_multi, w_multi, ids_multi = full_dataset.to_numpy()
    X_multi, y_multi, w_multi, ids_multi = dataset.to_numpy()


    ####### Do singletask load
    X_tasks, y_tasks, w_tasks, ids_tasks = [], [], [], []
    for task in all_tasks:
    y_tasks, w_tasks, = [], []
    for ind, task in enumerate(all_tasks):
      print("Processing task %s" % task)
      if os.path.exists(full_dir):
        shutil.rmtree(full_dir)
      full_dataset = Dataset(data_dir=full_dir, samples=featurized_samples, 
                              featurizers=featurizers, tasks=[task],
                              verbosity=verbosity, reload=reload)

      X_task, y_task, w_task, ids_task = full_dataset.to_numpy()
      X_tasks.append(X_task)
      y_tasks.append(y_task)
      w_tasks.append(w_task)
      ids_tasks.append(ids_task)
      dataset = Dataset(data_dir, verbosity=verbosity, reload=reload)

      X_task, y_task, w_task, ids_task = dataset.to_numpy()
      y_tasks.append(y_task[:, ind])
      w_tasks.append(w_task[:, ind])

    ################## Do comparison
    for ind, task in enumerate(all_tasks):
@@ -145,7 +121,6 @@ class TestLoad(TestAPI):

      y_task = y_tasks[ind]
      w_task = w_tasks[ind]
      ids_task = ids_tasks[ind]

      np.testing.assert_allclose(y_multi_task.flatten(), y_task.flatten())
      np.testing.assert_allclose(w_multi_task.flatten(), w_task.flatten())
@@ -163,9 +138,7 @@ class TestLoad(TestAPI):

    current_dir = os.path.dirname(os.path.realpath(__file__))
    #Make directories to store the raw and featurized datasets.
    feature_dir = os.path.join(base_dir, "features")
    samples_dir = os.path.join(base_dir, "samples")
    full_dir = os.path.join(base_dir, "full_dataset")
    data_dir = os.path.join(base_dir, "dataset")
    train_dir = os.path.join(base_dir, "train_dataset")
    valid_dir = os.path.join(base_dir, "valid_dataset")
    test_dir = os.path.join(base_dir, "test_dataset")
@@ -188,48 +161,29 @@ class TestLoad(TestAPI):
    tasks = all_tasks[0:n_tasks]

    ####### Do multitask load
    if os.path.exists(feature_dir):
      shutil.rmtree(feature_dir)
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field="smiles",
                                compound_featurizers=featurizers,
                                featurizers=featurizers,
                                verbosity=verbosity)
    featurized_samples = featurizer.featurize(
        dataset_file, feature_dir,
        samples_dir, shard_size=8192,
        reload=reload)
    if os.path.exists(full_dir):
      shutil.rmtree(full_dir)
    full_dataset = Dataset(data_dir=full_dir, samples=featurized_samples, 
                            featurizers=featurizers, tasks=tasks,
                            verbosity=verbosity, reload=reload)
    dataset = featurizer.featurize(dataset_file, data_dir)

    # Do train/valid split.
    X_multi, y_multi, w_multi, ids_multi = full_dataset.to_numpy()
    X_multi, y_multi, w_multi, ids_multi = dataset.to_numpy()


    ####### Do singletask load
    X_tasks, y_tasks, w_tasks, ids_tasks = [], [], [], []
    y_tasks, w_tasks, ids_tasks = [], [], []
    for task in tasks:
      print("Processing task %s" % task)
      if os.path.exists(feature_dir):
        shutil.rmtree(feature_dir)
      if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
      featurizer = DataFeaturizer(tasks=[task],
                                  smiles_field="smiles",
                                  compound_featurizers=featurizers,
                                  featurizers=featurizers,
                                  verbosity=verbosity)
      featurized_samples = featurizer.featurize(
          dataset_file, feature_dir,
          samples_dir, shard_size=8192,
          reload=reload)
      if os.path.exists(full_dir):
        shutil.rmtree(full_dir)
      full_dataset = Dataset(data_dir=full_dir, samples=featurized_samples, 
                              featurizers=featurizers, tasks=[task],
                              verbosity=verbosity, reload=reload)

      X_task, y_task, w_task, ids_task = full_dataset.to_numpy()
      X_tasks.append(X_task)
      dataset = featurizer.featurize(dataset_file, data_dir)

      X_task, y_task, w_task, ids_task = dataset.to_numpy()
      y_tasks.append(y_task)
      w_tasks.append(w_task)
      ids_tasks.append(ids_task)
@@ -239,7 +193,6 @@ class TestLoad(TestAPI):
      y_multi_task = y_multi[:, ind]
      w_multi_task = w_multi[:, ind]

      #X_task = X_tasks[ind]
      y_task = y_tasks[ind]
      w_task = w_tasks[ind]
      ids_task = ids_tasks[ind]
+1 −1

File changed.

Contains only whitespace changes.