Commit 181c6938 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Fixes to more tests

parent ee2bc2a8
Loading
Loading
Loading
Loading
+34 −0
Original line number Diff line number Diff line
@@ -82,6 +82,15 @@ class Dataset(object):
      assert X.shape[0] == y.shape[0]
      assert y.shape == w.shape
      assert len(ids) == X.shape[0]
    ############################### DEBUG
    #print("Dataset.write_dataframe()")
    #print("X is None, y is None, w is None")
    #print(X is None, y is None, w is None)
    #print("basename, data_dir")
    #print(basename, data_dir)
    #print("os.path.exists(data_dir)")
    #print(os.path.exists(data_dir))
    ############################### DEBUG
    return Dataset.write_data_to_disk(data_dir, basename, tasks, X, y, w, ids)

  @staticmethod
@@ -115,8 +124,22 @@ class Dataset(object):
    out_w = "%s-w.joblib" % basename
    out_w_transformed = "%s-w-transformed.joblib" % basename
    out_ids = "%s-ids.joblib" % basename
    ############################### DEBUG
    #print("Dataset.write_data_to_disk()")
    #print("X is None, y is None, w is None")
    #print(X is None, y is None, w is None)
    ##import traceback
    ##traceback.print_stack()
    ############################### DEBUG

    if X is not None:
      ############################### DEBUG
      #print("Dataset.write_data_to_disk()")
      #print("basename")
      #print(basename)
      #print("os.path.join(data_dir, out_X)")
      #print(os.path.join(data_dir, out_X))
      ############################### DEBUG
      save_to_disk(X, os.path.join(data_dir, out_X))
      save_to_disk(X, os.path.join(data_dir, out_X_transformed))
      X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
@@ -124,6 +147,12 @@ class Dataset(object):
      save_to_disk(X_sum_squares, os.path.join(data_dir, out_X_sum_squares))
      save_to_disk(X_n, os.path.join(data_dir, out_X_n))
    if y is not None:
      ############################### DEBUG
      #print("Dataset.write_data_to_disk()")
      #print("Writing y to")
      #print("os.path.join(data_dir, out_y_transformed)")
      #print(os.path.join(data_dir, out_y_transformed))
      ############################### DEBUG
      save_to_disk(y, os.path.join(data_dir, out_y))
      save_to_disk(y, os.path.join(data_dir, out_y_transformed))
      y_sums, y_sum_squares, y_n = compute_sums_and_nb_sample(y, w)
@@ -292,6 +321,11 @@ class Dataset(object):
    """
    total = 0
    for _, row in self.metadata_df.iterrows():
      ###################################### DEBUG
      #print("Dataset.__len__()")
      #print("self.data_dir, row['y-transformed']")
      #print(self.data_dir, row['y-transformed'])
      ###################################### DEBUG
      y = load_from_disk(os.path.join(self.data_dir, row['y-transformed']))
      total += len(y)
    return total
+57 −55
Original line number Diff line number Diff line
@@ -144,12 +144,8 @@ class DataFeaturizer(object):
    self.featurizers = featurizers
    self.log_every_n = log_every_n

  def featurize(self, input_file, data_dir, shard_size=8192, worker_pool=None,
                reload=False):
  def featurize(self, input_file, data_dir, shard_size=8192, worker_pool=None):
    """Featurize provided file and write to specified location."""
    # If we are not to reload data, or data has not already been featurized.

    if not reload:
    log("Loading raw samples now.", self.verbosity)

    raw_df = load_data(input_file)
@@ -157,6 +153,9 @@ class DataFeaturizer(object):
    log("Loaded raw data frame from file.", self.verbosity)
    log("About to preprocess samples.", self.verbosity)

    if not os.path.exists(data_dir):
      os.makedirs(data_dir)

    def process_raw_sample_helper(row, fields, input_type):
      return self._process_raw_sample(input_type, row, fields)
    input_type = _get_input_type(input_file)
@@ -198,9 +197,12 @@ class DataFeaturizer(object):
                                    worker_pool=worker_pool)
      basename = "shard-%d" % j
      metadata_rows.append(write_dataframe_partial((basename, df)))
    else:
      metadata_rows = None

    ################################################## DEBUG
    print("DataFeaturizer.featurize()")
    #print("data_dir, len(metadata_rows)")
    #print(data_dir, len(metadata_rows))
    ################################################## DEBUG
    dataset = Dataset(data_dir=data_dir,
                      metadata_rows=metadata_rows,
                      reload=reload, verbosity=self.verbosity)
+131 −92
Original line number Diff line number Diff line
@@ -13,57 +13,58 @@ import os
import unittest
import tempfile
import shutil
from deepchem.datasets import Dataset
from deepchem.models.tests import TestAPI
from deepchem.splits import RandomSplitter
from deepchem.splits import ScaffoldSplitter
from deepchem.splits import SpecifiedSplitter
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.featurizers.featurize import FeaturizedSamples
#from deepchem.featurizers.featurize import FeaturizedSamples

class TestFeaturizedSamples(TestAPI):
  """
  Test Featurized Samples class.
  """
  def _featurize_train_valid_test_split(self, splittype, input_file, tasks,
                                        frac_train, frac_valid, frac_test):
    # Featurize input
    compound_featurizers = [CircularFingerprint(size=1024)]
    complex_featurizers = []
    featurizers = compound_featurizers + complex_featurizers

    input_file = os.path.join(self.current_dir, input_file)
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                compound_featurizers=compound_featurizers,
                                complex_featurizers=complex_featurizers,
                                verbosity="low")

    #Featurizes samples and transforms them into NumPy arrays suitable for ML.
    #returns an instance of class FeaturizedSamples()

    samples = featurizer.featurize(input_file, self.feature_dir, self.samples_dir)

    # Splits featurized samples into train/test
    assert splittype in ["random", "specified", "scaffold"]
    if splittype == "random":
      splitter = RandomSplitter()
    elif splittype == "specified":
      splitter = SpecifiedSplitter()
    elif splittype == "scaffold":
      splitter = ScaffoldSplitter()
    if frac_valid > 0:
      train_samples, valid_samples, test_samples = splitter.train_valid_test_split(
          samples, train_dir=self.train_dir, valid_dir=self.valid_dir,
          test_dir=self.test_dir, frac_train=frac_train,
          frac_valid=frac_valid, frac_test=frac_test)

      return train_samples, valid_samples, test_samples
    else:
      train_samples, test_samples = splitter.train_test_split(
          samples, train_dir=self.train_dir, test_dir=self.test_dir,
          frac_train=frac_train)
      return train_samples, test_samples
  #def _featurize_train_valid_test_split(self, splittype, input_file, tasks,
  #                                      frac_train, frac_valid, frac_test):
  #  # Featurize input
  #  compound_featurizers = [CircularFingerprint(size=1024)]
  #  complex_featurizers = []
  #  featurizers = compound_featurizers + complex_featurizers

  #  input_file = os.path.join(self.current_dir, input_file)
  #  featurizer = DataFeaturizer(tasks=tasks,
  #                              smiles_field=self.smiles_field,
  #                              compound_featurizers=compound_featurizers,
  #                              complex_featurizers=complex_featurizers,
  #                              verbosity="low")

  #  #Featurizes samples and transforms them into NumPy arrays suitable for ML.
  #  #returns an instance of class FeaturizedSamples()

  #  samples = featurizer.featurize(input_file, self.feature_dir, self.samples_dir)

  #  # Splits featurized samples into train/test
  #  assert splittype in ["random", "specified", "scaffold"]
  #  if splittype == "random":
  #    splitter = RandomSplitter()
  #  elif splittype == "specified":
  #    splitter = SpecifiedSplitter()
  #  elif splittype == "scaffold":
  #    splitter = ScaffoldSplitter()
  #  if frac_valid > 0:
  #    train_samples, valid_samples, test_samples = splitter.train_valid_test_split(
  #        samples, train_dir=self.train_dir, valid_dir=self.valid_dir,
  #        test_dir=self.test_dir, frac_train=frac_train,
  #        frac_valid=frac_valid, frac_test=frac_test)

  #    return train_samples, valid_samples, test_samples
  #  else:
  #    train_samples, test_samples = splitter.train_test_split(
  #        samples, train_dir=self.train_dir, test_dir=self.test_dir,
  #        frac_train=frac_train)
  #    return train_samples, test_samples

  def scaffold_test_train_valid_test_split(self):
    """Test of singletask RF ECFP regression API."""
@@ -74,14 +75,24 @@ class TestFeaturizedSamples(TestAPI):
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = "../../models/tests/example.csv"
    train_samples, valid_samples, test_samples = (
        self._featurize_train_valid_test_split(
            splittype, input_file, tasks, frac_train=.8,
            frac_valid=.1, frac_test=.1))
    assert len(train_samples) == 8
    assert len(valid_samples) == 1
    assert len(test_samples) == 1
    input_file = os.path.join(self.current_dir, "example.csv")
    featurizers = [CircularFingerprint(size=1024)]

    input_file = os.path.join(self.current_dir, input_file)
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                featurizers=featurizers,
                                verbosity="low")

    dataset = featurizer.featurize(input_file, self.data_dir)

    # Splits featurized samples into train/test
    splitter = ScaffoldSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)
    assert len(train_dataset) == 8
    assert len(valid_dataset) == 1
    assert len(test_dataset) == 1

  def scaffold_test_train_test_split(self):
    """Test of singletask RF ECFP regression API."""
@@ -92,78 +103,106 @@ class TestFeaturizedSamples(TestAPI):
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = "../../models/tests/example.csv"
    train_samples, test_samples = (
        self._featurize_train_valid_test_split(
            splittype, input_file, tasks, frac_train=.8,
            frac_valid=0, frac_test=.2))
    assert len(train_samples) == 8
    assert len(test_samples) == 2
    input_file = os.path.join(self.current_dir, "example.csv")
    featurizers = [CircularFingerprint(size=1024)]

    input_file = os.path.join(self.current_dir, input_file)
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                featurizers=featurizers,
                                verbosity="low")

    dataset = featurizer.featurize(input_file, self.data_dir)

    # Splits featurized samples into train/test
    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    assert len(train_dataset) == 8
    assert len(test_dataset) == 2

  def random_test_train_valid_test_split(self):
    """Test of singletask RF ECFP regression API."""
    splittype = "random"
    input_transforms = []
    output_transforms = ["normalize"]
    model_params = {}
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = "../../models/tests/example.csv"
    train_samples, valid_samples, test_samples = (
        self._featurize_train_valid_test_split(
            splittype, input_file, tasks, frac_train=.8,
            frac_valid=.1, frac_test=.1))
    assert len(train_samples) == 8
    assert len(valid_samples) == 1
    assert len(test_samples) == 1
    input_file = os.path.join(self.current_dir, "example.csv")
    featurizers = [CircularFingerprint(size=1024)]

    input_file = os.path.join(self.current_dir, input_file)
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                featurizers=featurizers,
                                verbosity="low")

    dataset = featurizer.featurize(input_file, self.data_dir)

    # Splits featurized samples into train/test
    splitter = RandomSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)
    assert len(train_dataset) == 8
    assert len(valid_dataset) == 1
    assert len(test_dataset) == 1

  def random_test_train_test_split(self):
    """Test of singletask RF ECFP regression API."""
    splittype = "random"
    input_transforms = []
    output_transforms = ["normalize"]
    #splittype = "random"
    model_params = {}
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = "../../models/tests/example.csv"
    train_samples, test_samples = (
        self._featurize_train_valid_test_split(
            splittype, input_file, tasks, frac_train=.8,
            frac_valid=0, frac_test=.2))
    assert len(train_samples) == 8
    assert len(test_samples) == 2
    input_file = os.path.join(self.current_dir, "example.csv")
    featurizers = [CircularFingerprint(size=1024)]
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                featurizers=featurizers,
                                verbosity="low")

    dataset = featurizer.featurize(input_file, self.data_dir)

    # Splits featurized samples into train/test
    splitter = RandomSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    #train_samples, test_samples = (
    #    self._featurize_train_valid_test_split(
    #        splittype, input_file, tasks, frac_train=.8,
    #        frac_valid=0, frac_test=.2))
    assert len(train_dataset) == 8
    assert len(test_dataset) == 2

  def test_samples_move(self):
    """Test that featurized samples can be moved and reloaded."""
    verbosity = "high"
    current_dir = os.path.dirname(os.path.realpath(__file__))
    feature_dir = os.path.join(self.base_dir, "features")
    moved_feature_dir = os.path.join(self.base_dir, "moved_features")
    samples_dir = os.path.join(self.base_dir, "samples")
    moved_samples_dir = os.path.join(self.base_dir, "moved_samples")
    data_dir = os.path.join(self.base_dir, "data")
    moved_data_dir = os.path.join(self.base_dir, "moved_data")
    ############################################## DEBUG
    print("data_dir")
    print(data_dir)
    print("moved_data_dir")
    print(moved_data_dir)
    ############################################## DEBUG
    dataset_file = os.path.join(
        current_dir, "../../models/tests/example.csv")
        self.current_dir, "example.csv")

    featurizers = [CircularFingerprint(size=1024)]
    tasks = ["log-solubility"]
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field="smiles",
                                compound_featurizers=featurizers,
                                featurizers=featurizers,
                                verbosity=verbosity)
    featurized_samples = featurizer.featurize(
        dataset_file, feature_dir,
        samples_dir, reload=reload)
    n_samples = len(featurized_samples)
    featurized_dataset = featurizer.featurize(
        dataset_file, data_dir)
    n_dataset = len(featurized_dataset)
  
    # Now perform move
    shutil.move(feature_dir, moved_feature_dir)
    shutil.move(samples_dir, moved_samples_dir)

    moved_featurized_samples = FeaturizedSamples(
        samples_dir=moved_samples_dir, featurizers=featurizers,
        reload=True)
    shutil.move(data_dir, moved_data_dir)

    assert len(moved_featurized_samples) == n_samples
    moved_featurized_dataset = Dataset(
        data_dir=moved_data_dir, reload=True)

    assert len(moved_featurized_dataset) == n_dataset