Commit d073d041 authored by Bharath Ramsundar
Browse files

Cleanup

parent ed4a3f7c
Loading
Loading
Loading
Loading
+0 −40
Original line number Diff line number Diff line
@@ -11,7 +11,6 @@ import multiprocessing as mp
from functools import partial
from deepchem.utils.save import save_to_disk
from deepchem.utils.save import load_from_disk
#from deepchem.featurizers.featurize import FeaturizedSamples
from deepchem.utils.save import log

__author__ = "Bharath Ramsundar"
@@ -82,15 +81,6 @@ class Dataset(object):
      assert X.shape[0] == y.shape[0]
      assert y.shape == w.shape
      assert len(ids) == X.shape[0]
    ############################### DEBUG
    #print("Dataset.write_dataframe()")
    #print("X is None, y is None, w is None")
    #print(X is None, y is None, w is None)
    #print("basename, data_dir")
    #print(basename, data_dir)
    #print("os.path.exists(data_dir)")
    #print(os.path.exists(data_dir))
    ############################### DEBUG
    return Dataset.write_data_to_disk(data_dir, basename, tasks, X, y, w, ids)

  @staticmethod
@@ -124,22 +114,8 @@ class Dataset(object):
    out_w = "%s-w.joblib" % basename
    out_w_transformed = "%s-w-transformed.joblib" % basename
    out_ids = "%s-ids.joblib" % basename
    ############################### DEBUG
    #print("Dataset.write_data_to_disk()")
    #print("X is None, y is None, w is None")
    #print(X is None, y is None, w is None)
    ##import traceback
    ##traceback.print_stack()
    ############################### DEBUG

    if X is not None:
      ############################### DEBUG
      #print("Dataset.write_data_to_disk()")
      #print("basename")
      #print(basename)
      #print("os.path.join(data_dir, out_X)")
      #print(os.path.join(data_dir, out_X))
      ############################### DEBUG
      save_to_disk(X, os.path.join(data_dir, out_X))
      save_to_disk(X, os.path.join(data_dir, out_X_transformed))
      X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
@@ -147,12 +123,6 @@ class Dataset(object):
      save_to_disk(X_sum_squares, os.path.join(data_dir, out_X_sum_squares))
      save_to_disk(X_n, os.path.join(data_dir, out_X_n))
    if y is not None:
      ############################### DEBUG
      #print("Dataset.write_data_to_disk()")
      #print("Writing y to")
      #print("os.path.join(data_dir, out_y_transformed)")
      #print(os.path.join(data_dir, out_y_transformed))
      ############################### DEBUG
      save_to_disk(y, os.path.join(data_dir, out_y))
      save_to_disk(y, os.path.join(data_dir, out_y_transformed))
      y_sums, y_sum_squares, y_n = compute_sums_and_nb_sample(y, w)
@@ -321,11 +291,6 @@ class Dataset(object):
    """
    total = 0
    for _, row in self.metadata_df.iterrows():
      ###################################### DEBUG
      #print("Dataset.__len__()")
      #print("self.data_dir, row['y-transformed']")
      #print(self.data_dir, row['y-transformed'])
      ###################################### DEBUG
      y = load_from_disk(os.path.join(self.data_dir, row['y-transformed']))
      total += len(y)
    return total
@@ -491,11 +456,6 @@ def _df_to_numpy(df, feature_types, tasks):
      missing[ind, :] = 1
      continue
    tensors.append(features)
  ################################################## DEBUG
  #print("_df_to_numpy")
  #print("tensors, n_samples, feature_types")
  #print(tensors, n_samples, feature_types)
  ################################################## DEBUG
  x = np.stack(tensors)
  sorted_ids = df["mol_id"]

+0 −4
Original line number Diff line number Diff line
@@ -30,10 +30,6 @@ class TestBasicDatasetAPI(TestDatasetAPI):
    assert solubility_dataset.get_task_names() == ["log-solubility"]

    multitask_dataset = self.load_multitask_data()
    ############################################ DEBUG
    print("multitask_dataset.get_task_names()")
    print(multitask_dataset.get_task_names())
    ############################################ DEBUG
    assert sorted(multitask_dataset.get_task_names()) == sorted(["task0",
        "task1", "task2", "task3", "task4", "task5", "task6", "task7", "task8",
        "task9", "task10", "task11", "task12", "task13", "task14", "task15",
+0 −10
Original line number Diff line number Diff line
@@ -198,11 +198,6 @@ class DataFeaturizer(object):
      basename = "shard-%d" % j
      metadata_rows.append(write_dataframe_partial((basename, df)))

    ################################################## DEBUG
    print("DataFeaturizer.featurize()")
    #print("data_dir, len(metadata_rows)")
    #print(data_dir, len(metadata_rows))
    ################################################## DEBUG
    dataset = Dataset(data_dir=data_dir,
                      metadata_rows=metadata_rows,
                      reload=reload, verbosity=self.verbosity)
@@ -313,11 +308,6 @@ class DataFeaturizer(object):
          mol = elem
        if ind % self.log_every_n == 0:
          log("Featurizing sample %d" % ind, self.verbosity)
        ###################################### DEBUG
        #print("DataFeaturizer._featurize_mol")
        #print("mol, self.verbosity")
        #print(mol, self.verbosity)
        ###################################### DEBUG
        features.append(featurizer.featurize([mol], verbosity=self.verbosity))
    else:
      def featurize_wrapper(elem, dilled_featurizer):
+0 −49
Original line number Diff line number Diff line
@@ -26,45 +26,6 @@ class TestFeaturizedSamples(TestAPI):
  """
  Test Featurized Samples class.
  """
  #def _featurize_train_valid_test_split(self, splittype, input_file, tasks,
  #                                      frac_train, frac_valid, frac_test):
  #  # Featurize input
  #  compound_featurizers = [CircularFingerprint(size=1024)]
  #  complex_featurizers = []
  #  featurizers = compound_featurizers + complex_featurizers

  #  input_file = os.path.join(self.current_dir, input_file)
  #  featurizer = DataFeaturizer(tasks=tasks,
  #                              smiles_field=self.smiles_field,
  #                              compound_featurizers=compound_featurizers,
  #                              complex_featurizers=complex_featurizers,
  #                              verbosity="low")

  #  #Featurizes samples and transforms them into NumPy arrays suitable for ML.
  #  #returns an instance of class FeaturizedSamples()

  #  samples = featurizer.featurize(input_file, self.feature_dir, self.samples_dir)

  #  # Splits featurized samples into train/test
  #  assert splittype in ["random", "specified", "scaffold"]
  #  if splittype == "random":
  #    splitter = RandomSplitter()
  #  elif splittype == "specified":
  #    splitter = SpecifiedSplitter()
  #  elif splittype == "scaffold":
  #    splitter = ScaffoldSplitter()
  #  if frac_valid > 0:
  #    train_samples, valid_samples, test_samples = splitter.train_valid_test_split(
  #        samples, train_dir=self.train_dir, valid_dir=self.valid_dir,
  #        test_dir=self.test_dir, frac_train=frac_train,
  #        frac_valid=frac_valid, frac_test=frac_test)

  #    return train_samples, valid_samples, test_samples
  #  else:
  #    train_samples, test_samples = splitter.train_test_split(
  #        samples, train_dir=self.train_dir, test_dir=self.test_dir,
  #        frac_train=frac_train)
  #    return train_samples, test_samples

  def scaffold_test_train_valid_test_split(self):
    """Test of singletask RF ECFP regression API."""
@@ -168,10 +129,6 @@ class TestFeaturizedSamples(TestAPI):
    splitter = RandomSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    #train_samples, test_samples = (
    #    self._featurize_train_valid_test_split(
    #        splittype, input_file, tasks, frac_train=.8,
    #        frac_valid=0, frac_test=.2))
    assert len(train_dataset) == 8
    assert len(test_dataset) == 2

@@ -180,12 +137,6 @@ class TestFeaturizedSamples(TestAPI):
    verbosity = "high"
    data_dir = os.path.join(self.base_dir, "data")
    moved_data_dir = os.path.join(self.base_dir, "moved_data")
    ############################################## DEBUG
    print("data_dir")
    print(data_dir)
    print("moved_data_dir")
    print(moved_data_dir)
    ############################################## DEBUG
    dataset_file = os.path.join(
        self.current_dir, "example.csv")

+0 −32
Original line number Diff line number Diff line
@@ -162,46 +162,14 @@ class Model(object):
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(batch_size):
      y_pred_batch = self.predict_proba_on_batch(X_batch)
      batch_size = len(y_batch)
      ######################################################### DEBUG
      #print("Model.predict_proba()")
      #print("y_pred_batch.shape")
      #print(y_pred_batch.shape)
      ######################################################### DEBUG
      #y_pred_batch = np.squeeze(
      #    np.reshape(y_pred_batch, (batch_size, n_tasks, n_classes)))
      y_pred_batch = np.reshape(y_pred_batch, (batch_size, n_tasks, n_classes))
      ######################################################### DEBUG
      #print("reshape")
      #print("y_pred_batch.shape")
      #print(y_pred_batch.shape)
      ######################################################### DEBUG
      y_pred_batch = undo_transforms(y_pred_batch, transformers)
      ######################################################### DEBUG
      #print("untransformed")
      #print("y_pred_batch.shape")
      #print(y_pred_batch.shape)
      ######################################################### DEBUG
      y_preds.append(y_pred_batch)
    ######################################################### DEBUG
    #print("[y_pred.shape for y_pred in y_preds]")
    #print([y_pred.shape for y_pred in y_preds])
    ######################################################### DEBUG
    y_pred = np.vstack(y_preds)
    ######################################################### DEBUG
    #print("y_pred.shape")
    #print(y_pred.shape)
    ######################################################### DEBUG
    # The iterbatches does padding with zero-weight examples on the last batch.
    # Remove padded examples.
    n_samples, n_tasks = len(dataset), len(self.tasks)
    y_pred = y_pred[:n_samples]
    ######################################################### DEBUG
    #print("Model.predict_proba()")
    #print("n_samples, y_pred.shape, y_batch.shape")
    #print(n_samples, y_pred.shape, y_batch.shape)
    #print("(n_samples, n_tasks, n_classes)")
    #print((n_samples, n_tasks, n_classes))
    ######################################################### DEBUG
    y_pred = np.reshape(y_pred, (n_samples, n_tasks, n_classes))
    return y_pred

Loading