Commit 4f6d254e authored by Bharath Ramsundar
Browse files

Splits on datasets

parent aa6e3b12
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -234,6 +234,15 @@ class Dataset(object):
    raw_data = (ids, X, y, w)
    return Dataset(data_dir=data_dir, tasks=tasks, raw_data=raw_data)

  def select(self, select_dir, indices):
    """Creates a new dataset from a selection of indices from self.

    Args:
      select_dir: Passed as the first argument to `Dataset.from_numpy`
        (presumably the directory the new dataset is written to).
      indices: Sequence of integer positions (or a boolean mask) used to
        index the arrays returned by `self.to_numpy()`. An empty sequence
        yields an empty selection.

    Returns:
      The object returned by `Dataset.from_numpy` for the selected rows.
    """
    indices = np.asarray(indices)
    if indices.size == 0 and indices.dtype.kind == "f":
      # NumPy infers float64 for an empty list, and float arrays are rejected
      # as indices; use an integer dtype so an empty selection is valid.
      indices = indices.astype(int)
    X, y, w, ids = self.to_numpy()
    tasks = self.get_task_names()
    X_sel, y_sel, w_sel, ids_sel = (
        X[indices], y[indices], w[indices], ids[indices])
    return Dataset.from_numpy(select_dir, X_sel, y_sel, w_sel, ids_sel, tasks)
    
  def to_numpy(self):
    """
    Transforms internal data into arrays X, y, w
+6 −4
Original line number Diff line number Diff line
@@ -18,7 +18,7 @@ from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import NormalizationTransformer
from deepchem.splits.tests import TestSplitAPI
from deepchem.models.tests import TestAPI

class TestDatasetAPI(TestAPI):
  """
@@ -30,7 +30,7 @@ class TestDatasetAPI(TestAPI):
    featurizers = [CircularFingerprint(size=1024)]
    tasks = ["log-solubility"]
    task_type = "regression"
    input_file = os.path.join(self.test_data_dir, "example.csv")
    input_file = os.path.join(self.current_dir, "../../models/tests/example.csv")
    featurizer = DataFeaturizer(
        tasks=tasks,
        smiles_field=self.smiles_field,
@@ -44,7 +44,8 @@ class TestDatasetAPI(TestAPI):
    featurizers = [CircularFingerprint(size=1024)]
    tasks = ["outcome"]
    task_type = "classification"
    input_file = os.path.join(self.test_data_dir, "example_classification.csv")
    input_file = os.path.join(
        self.current_dir, "../../models/tests/example_classification.csv")
    featurizer = DataFeaturizer(
        tasks=tasks,
        smiles_field=self.smiles_field,
@@ -58,7 +59,8 @@ class TestDatasetAPI(TestAPI):
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    input_file = os.path.join(self.test_data_dir, "multitask_example.csv")
    input_file = os.path.join(
        self.current_dir, "../../models/tests/multitask_example.csv")
    featurizer = DataFeaturizer(
        tasks=tasks,
        smiles_field=self.smiles_field,
+4 −17
Original line number Diff line number Diff line
@@ -53,8 +53,7 @@ class TestAPI(unittest.TestCase):
    # debug.
    #shutil.rmtree(self.model_dir)

  def _featurize_train_test_split(self, splittype, compound_featurizers, 
                                  complex_featurizers,
  def _featurize_train_test_split(self, splittype, featurizers, 
                                  input_transformer_classes,
                                  output_transformer_classes, input_file, tasks, 
                                  protein_pdb_field=None, ligand_pdb_field=None,
@@ -62,15 +61,12 @@ class TestAPI(unittest.TestCase):
                                  split_field=None,
                                  shard_size=100):
    # Featurize input
    featurizers = compound_featurizers + complex_featurizers

    input_file = os.path.join(self.current_dir, input_file)
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                protein_pdb_field=protein_pdb_field,
                                ligand_pdb_field=ligand_pdb_field,
                                compound_featurizers=compound_featurizers,
                                complex_featurizers=complex_featurizers,
                                featurizers=featurizers,
                                user_specified_features=user_specified_features,
                                split_field=split_field,
                                verbosity="low")
@@ -79,8 +75,7 @@ class TestAPI(unittest.TestCase):
    #Featurizes samples and transforms them into NumPy arrays suitable for ML.
    #returns an instance of class FeaturizedSamples()

    samples = featurizer.featurize(input_file, self.feature_dir, self.samples_dir,
                                   shard_size=shard_size)
    dataset = featurizer.featurize(input_file, self.data_dir, shard_size=shard_size)

    # Splits featurized samples into train/test
    assert splittype in ["random", "specified", "scaffold"]
@@ -90,17 +85,9 @@ class TestAPI(unittest.TestCase):
      splitter = SpecifiedSplitter()
    elif splittype == "scaffold":
      splitter = ScaffoldSplitter()
    train_samples, test_samples = splitter.train_test_split(
    train_dataset, test_dataset = splitter.train_test_split(
        samples, self.train_dir, self.test_dir)

    use_user_specified_features = (user_specified_features is not None)
    train_dataset = Dataset(data_dir=self.train_dir, samples=train_samples, 
                            featurizers=featurizers, tasks=tasks,
                            use_user_specified_features=use_user_specified_features)
    test_dataset = Dataset(data_dir=self.test_dir, samples=test_samples, 
                           featurizers=featurizers, tasks=tasks,
                           use_user_specified_features=use_user_specified_features)

    # Initialize transformers
    input_transformers = []
    for transform_class in input_transformer_classes:
+17 −30
Original line number Diff line number Diff line
@@ -14,7 +14,6 @@ import unittest
import tempfile
import shutil
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import FeaturizedSamples
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.featurizers.basic import RDKitDescriptors
from deepchem.featurizers.grid_featurizer import GridFeaturizer
@@ -39,8 +38,7 @@ class TestModelAPI(TestAPI):
  def test_singletask_sklearn_rf_ECFP_regression_API(self):
    """Test of singletask RF ECFP regression API."""
    splittype = "scaffold"
    compound_featurizers = [CircularFingerprint(size=1024)]
    complex_featurizers = []
    featurizers = [CircularFingerprint(size=1024)]
    input_transformers = []
    output_transformers = [NormalizationTransformer]
    model_params = {}
@@ -49,7 +47,7 @@ class TestModelAPI(TestAPI):
    task_types = {task: task_type for task in tasks}
    input_file = "example.csv"
    train_dataset, test_dataset, _, transformers, = self._featurize_train_test_split(
        splittype, compound_featurizers, 
        splittype, featurizers, 
        complex_featurizers, input_transformers,
        output_transformers, input_file, tasks)
    model_params["data_shape"] = train_dataset.get_data_shape()
@@ -77,8 +75,7 @@ class TestModelAPI(TestAPI):
    """Test of singletask RF USF regression API."""
    splittype = "specified"
    split_field = "split"
    compound_featurizers = []
    complex_featurizers = []
    featurizers = []
    input_transformers = []
    output_transformers = [NormalizationTransformer]
    model_params = {}
@@ -88,9 +85,8 @@ class TestModelAPI(TestAPI):
    input_file = "user_specified_example.csv"
    user_specified_features = ["user-specified1", "user-specified2"]
    train_dataset, test_dataset, _, transformers, = self._featurize_train_test_split(
        splittype, compound_featurizers, 
        complex_featurizers, input_transformers,
        output_transformers, input_file, tasks,
        splittype, featurizers, 
        input_transformers, output_transformers, input_file, tasks,
        user_specified_features=user_specified_features,
        split_field=split_field)
    model_params["data_shape"] = train_dataset.get_data_shape()
@@ -117,8 +113,7 @@ class TestModelAPI(TestAPI):
  def test_singletask_sklearn_rf_ECFP_regression_sharded_API(self):
    """Test of singletask RF ECFP regression API: sharded edition."""
    splittype = "scaffold"
    compound_featurizers = [CircularFingerprint(size=1024)]
    complex_featurizers = []
    featurizers = [CircularFingerprint(size=1024)]
    input_transformers = []
    output_transformers = [NormalizationTransformer]
    model_params = {}
@@ -127,10 +122,8 @@ class TestModelAPI(TestAPI):
    task_types = {task: task_type for task in tasks}
    input_file = "../../../datasets/pdbbind_core_df.pkl.gz"
    train_dataset, test_dataset, _, transformers = self._featurize_train_test_split(
        splittype, compound_featurizers, 
        complex_featurizers, input_transformers,
        output_transformers, input_file, tasks,
        shard_size=50)
        splittype, featurizers, input_transformers, output_transformers,
        input_file, tasks, shard_size=50)
    # We set shard size above to force the creation of multiple shards of the data.
    # pdbbind_core has ~200 examples.
    model_params["data_shape"] = train_dataset.get_data_shape()
@@ -157,8 +150,7 @@ class TestModelAPI(TestAPI):
  def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
    """Test of singletask RF RDKIT-descriptor regression API."""
    splittype = "scaffold"
    compound_featurizers = [RDKitDescriptors()]
    complex_featurizers = []
    featurizers = [RDKitDescriptors()]
    input_transformers = [NormalizationTransformer, ClippingTransformer]
    output_transformers = [NormalizationTransformer]
    tasks = ["log-solubility"]
@@ -167,9 +159,8 @@ class TestModelAPI(TestAPI):
    model_params = {}
    input_file = "example.csv"
    train_dataset, test_dataset, _, transformers = self._featurize_train_test_split(
        splittype, compound_featurizers, 
        complex_featurizers, input_transformers,
        output_transformers, input_file, tasks)
        splittype, featurizers, input_transformers, output_transformers,
        input_file, tasks)
    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
@@ -195,8 +186,7 @@ class TestModelAPI(TestAPI):
    """Test of singletask MLP User Specified Features regression API."""
    from deepchem.models.keras_models.fcnet import SingleTaskDNN
    splittype = "scaffold"
    compound_featurizers = []
    complex_featurizers = []
    featurizers = []
    input_transformers = [NormalizationTransformer, ClippingTransformer]
    output_transformers = [NormalizationTransformer]
    feature_types = ["user_specified_features"]
@@ -215,7 +205,7 @@ class TestModelAPI(TestAPI):
    protein_pdb_field = None
    ligand_pdb_field = None
    train_dataset, test_dataset, _, transformers = self._featurize_train_test_split(
        splittype, compound_featurizers,
        splittype, featurizers,
        complex_featurizers, input_transformers,
        output_transformers, input_file, tasks,
        protein_pdb_field=protein_pdb_field,
@@ -263,12 +253,10 @@ class TestModelAPI(TestAPI):
             "task13", "task14", "task15", "task16"]
    task_types = {task: task_type for task in tasks}

    compound_featurizers = [CircularFingerprint(size=1024)]
    complex_featurizers = []
    featurizers = [CircularFingerprint(size=1024)]

    train_dataset, test_dataset, _, transformers = self._featurize_train_test_split(
        splittype, compound_featurizers, 
        complex_featurizers, input_transformers,
        splittype, featurizers, input_transformers,
        output_transformers, input_file, tasks)
    model_params["data_shape"] = train_dataset.get_data_shape()
    classification_metrics = [Metric(metrics.roc_auc_score),
@@ -297,8 +285,7 @@ class TestModelAPI(TestAPI):
    input_transformers = []
    task_type = "classification"

    compound_featurizers = [CircularFingerprint(size=1024)]
    complex_featurizers = []
    featurizers = [CircularFingerprint(size=1024)]

    tasks = ["outcome"]
    task_type = "classification"
@@ -308,7 +295,7 @@ class TestModelAPI(TestAPI):
    output_transformers = [NormalizationTransformer]

    train_dataset, test_dataset, _, transformers = self._featurize_train_test_split(
        splittype, compound_featurizers, 
        splittype, featurizers, 
        complex_featurizers, input_transformers,
        output_transformers, input_file, tasks)

+15 −36
Original line number Diff line number Diff line
@@ -32,52 +32,31 @@ class Splitter(object):
    """Creates splitter object."""
    self.verbosity = verbosity

  def train_valid_test_split(self, datset, train_dir,
  def train_valid_test_split(self, dataset, train_dir,
                             valid_dir, test_dir, frac_train=.8,
                             frac_valid=.1, frac_test=.1, seed=None,
                             log_every_n=1000, reload=False):
                             log_every_n=1000):
    """
    Splits self into train/validation/test sets.

    Returns Dataset objects.
    """
    if not reload:
    log("Computing train/valid/test indices", self.verbosity)
    train_inds, valid_inds, test_inds = self.split(
        dataset,
        frac_train=frac_train, frac_test=frac_test,
        frac_valid=frac_valid, log_every_n=log_every_n)

    # Generate train dir
    train_samples = Dataset(samples_dir=train_dir, 
                            dataset_files=dataset_files,
                            featurizers=samples.featurizers,
                            verbosity=self.verbosity,
                            reload=reload)
    if compute_split:
      train_samples._set_compound_df(samples.compounds_df.iloc[train_inds])
    # Generate test dir
    test_samples = Dataset(samples_dir=test_dir, 
                           dataset_files=dataset_files,
                           featurizers=samples.featurizers,
                           verbosity=self.verbosity,
                           reload=reload)
    if compute_split:
      test_samples._set_compound_df(samples.compounds_df.iloc[test_inds])
    # if requested, generated valid_dir
    train_dataset = dataset.select(train_dir, train_inds)
    if valid_dir is not None:
      valid_samples = Dataset(samples_dir=valid_dir, 
                              dataset_files=dataset_files,
                              featurizers=samples.featurizers,
                              verbosity=self.verbosity,
                              reload=reload)
      if compute_split:
        valid_samples._set_compound_df(samples.compounds_df.iloc[valid_inds])
      valid_dataset = dataset.select(valid_dir, valid_inds)
    else:
      valid_dataset = None
    test_dataset = dataset.select(test_dir, test_inds)

    return train_samples, valid_samples, test_samples
    return train_dataset, valid_dataset, test_dataset

  def train_test_split(self, samples, train_dir, test_dir, seed=None,
                       frac_train=.8, reload=False):
                       frac_train=.8):
    """
    Splits self into train/test sets.

@@ -157,7 +136,7 @@ class ScaffoldSplitter(Splitter):
    scaffolds = {}
    log("About to generate scaffolds", self.verbosity)
    data_len = len(dataset)
    for smiles in dataset.get_ids():
    for ind, smiles in enumerate(dataset.get_ids()):
      if self.verbosity is not None and ind % log_every_n == 0:
        log("Generating scaffold %d/%d" % (ind, data_len), self.verbosity)
      scaffold = generate_scaffold(smiles)
Loading