Unverified commit 036edfe6, authored by Bharath Ramsundar and committed by GitHub

Merge pull request #1965 from deepchem/transformer_docs

Assorted Fixes
parents 9e8f9fac 74fb2716
+0 −122
"""
General API for testing dataset objects
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import unittest
import tempfile
import os
import shutil
import numpy as np
import deepchem as dc


def load_solubility_data():
  """Loads solubility dataset"""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  task_type = "regression"
  input_file = os.path.join(current_dir, "../../models/tests/example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)

  return loader.featurize(input_file)


def load_butina_data():
  """Loads solubility dataset"""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["task"]
  # task_type = "regression"
  input_file = os.path.join(current_dir,
                            "../../models/tests/butina_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)

  return loader.featurize(input_file)


def load_multitask_data():
  """Load example multitask data."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = [
      "task0", "task1", "task2", "task3", "task4", "task5", "task6", "task7",
      "task8", "task9", "task10", "task11", "task12", "task13", "task14",
      "task15", "task16"
  ]
  input_file = os.path.join(current_dir,
                            "../../models/tests/multitask_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)


def load_classification_data():
  """Loads classification data from example.csv"""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["outcome"]
  task_type = "classification"
  input_file = os.path.join(current_dir,
                            "../../models/tests/example_classification.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)


def load_sparse_multitask_dataset():
  """Load sparse tox multitask data, sample dataset."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = [
      "task1", "task2", "task3", "task4", "task5", "task6", "task7", "task8",
      "task9"
  ]
  input_file = os.path.join(current_dir,
                            "../../models/tests/sparse_multitask_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)


def load_feat_multitask_data():
  """Load example with numerical features, tasks."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  features = ["feat0", "feat1", "feat2", "feat3", "feat4", "feat5"]
  featurizer = dc.feat.UserDefinedFeaturizer(features)
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5"]
  input_file = os.path.join(current_dir,
                            "../../models/tests/feat_multitask_example.csv")
  loader = dc.data.UserCSVLoader(
      tasks=tasks, featurizer=featurizer, id_field="id")
  return loader.featurize(input_file)


def load_gaussian_cdf_data():
  """Load example with numbers sampled from Gaussian normal distribution.
     Each feature and task is a column of values that is sampled
     from a normal distribution of mean 0, stdev 1."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  features = ["feat0", "feat1"]
  featurizer = dc.feat.UserDefinedFeaturizer(features)
  tasks = ["task0", "task1"]
  input_file = os.path.join(current_dir,
                            "../../models/tests/gaussian_cdf_example.csv")
  loader = dc.data.UserCSVLoader(
      tasks=tasks, featurizer=featurizer, id_field="id")
  return loader.featurize(input_file)


def load_unlabelled_data():
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = []
  input_file = os.path.join(current_dir, "../../data/tests/no_labels.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)
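The shared dc.data.tests loader helpers above are deleted outright; the diffs that follow give each test module its own local copy and start moving from loader.featurize() to the newer loader.create_dataset() entry point. A minimal sketch of the migrated pattern, reusing the path and task name from the deleted code (create_dataset is taken from the next diff, not from this file):

import os
import deepchem as dc


def load_solubility_data():
  """Featurize the example solubility CSV into a dc.data.Dataset."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  loader = dc.data.CSVLoader(
      tasks=["log-solubility"], smiles_field="smiles", featurizer=featurizer)
  input_file = os.path.join(current_dir, "../../models/tests/example.csv")
  # create_dataset supersedes featurize as the loading call (see diff below).
  return loader.create_dataset(input_file)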
+45 −16
@@ -24,6 +24,35 @@ except ImportError:
  PYTORCH_IMPORT_FAILED = True


+def load_solubility_data():
+  """Loads solubility dataset"""
+  current_dir = os.path.dirname(os.path.abspath(__file__))
+  featurizer = dc.feat.CircularFingerprint(size=1024)
+  tasks = ["log-solubility"]
+  task_type = "regression"
+  input_file = os.path.join(current_dir, "../../models/tests/example.csv")
+  loader = dc.data.CSVLoader(
+      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
+
+  return loader.create_dataset(input_file)
+
+
+def load_multitask_data():
+  """Load example multitask data."""
+  current_dir = os.path.dirname(os.path.abspath(__file__))
+  featurizer = dc.feat.CircularFingerprint(size=1024)
+  tasks = [
+      "task0", "task1", "task2", "task3", "task4", "task5", "task6", "task7",
+      "task8", "task9", "task10", "task11", "task12", "task13", "task14",
+      "task15", "task16"
+  ]
+  input_file = os.path.join(current_dir,
+                            "../../models/tests/multitask_example.csv")
+  loader = dc.data.CSVLoader(
+      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
+  return loader.featurize(input_file)
+
+
class TestDatasets(test_util.TensorFlowTestCase):
  """
  Test basic top-level API for dataset objects.
@@ -172,10 +201,10 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_get_task_names(self):
    """Test that get_task_names returns correct task_names"""
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    assert solubility_dataset.get_task_names() == ["log-solubility"]

-    multitask_dataset = dc.data.tests.load_multitask_data()
+    multitask_dataset = load_multitask_data()
    assert sorted(multitask_dataset.get_task_names()) == sorted([
        "task0", "task1", "task2", "task3", "task4", "task5", "task6", "task7",
        "task8", "task9", "task10", "task11", "task12", "task13", "task14",
@@ -184,20 +213,20 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_get_data_shape(self):
    """Test that get_data_shape returns currect data shape"""
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    assert solubility_dataset.get_data_shape() == (1024,)

-    multitask_dataset = dc.data.tests.load_multitask_data()
+    multitask_dataset = load_multitask_data()
    assert multitask_dataset.get_data_shape() == (1024,)

  def test_len(self):
    """Test that len(dataset) works."""
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    assert len(solubility_dataset) == 10

  def test_reshard(self):
    """Test that resharding the dataset works."""
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
                    solubility_dataset.w, solubility_dataset.ids)
    assert solubility_dataset.get_number_shards() == 1
@@ -302,7 +331,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_iterbatches(self):
    """Test that iterating over batches of data works."""
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    batch_size = 2
    data_shape = solubility_dataset.get_data_shape()
    tasks = solubility_dataset.get_task_names()
@@ -331,7 +360,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_itersamples_disk(self):
    """Test that iterating over samples in a DiskDataset works."""
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    X = solubility_dataset.X
    y = solubility_dataset.y
    w = solubility_dataset.w
@@ -372,7 +401,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_transform_disk(self):
    """Test that the transform() method works for DiskDatasets."""
-    dataset = dc.data.tests.load_solubility_data()
+    dataset = load_solubility_data()
    X = dataset.X
    y = dataset.y
    w = dataset.w
@@ -394,7 +423,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_to_numpy(self):
    """Test that transformation to numpy arrays is sensible."""
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    data_shape = solubility_dataset.get_data_shape()
    tasks = solubility_dataset.get_task_names()
    X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
@@ -409,7 +438,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_consistent_ordering(self):
    """Test that ordering of labels is consistent over time."""
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()

    ids1 = solubility_dataset.ids
    ids2 = solubility_dataset.ids
@@ -418,7 +447,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_get_statistics(self):
    """Test statistics computation of this dataset."""
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
    X_means, y_means = np.mean(X, axis=0), np.mean(y, axis=0)
@@ -431,7 +460,7 @@ class TestDatasets(test_util.TensorFlowTestCase):
    np.testing.assert_allclose(comp_y_stds, y_stds)

  def test_disk_iterate_batch_size(self):
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
    batch_sizes = []
@@ -656,7 +685,7 @@ class TestDatasets(test_util.TensorFlowTestCase):
          np.sort(all_ids, axis=0), np.sort(test_ids, axis=0))

  def test_numpy_iterate_batch_size(self):
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
    solubility_dataset = dc.data.NumpyDataset.from_DiskDataset(
@@ -769,12 +798,12 @@ class TestDatasets(test_util.TensorFlowTestCase):
  @unittest.skipIf(PYTORCH_IMPORT_FAILED, 'PyTorch is not installed')
  def test_make_pytorch_dataset_from_disk(self):
    """Test creating a PyTorch Dataset from a DiskDataset."""
-    dataset = dc.data.tests.load_solubility_data()
+    dataset = load_solubility_data()
    self._validate_pytorch_dataset(dataset)

  def test_dataframe(self):
    """Test converting between Datasets and DataFrames."""
-    dataset = dc.data.tests.load_solubility_data()
+    dataset = load_solubility_data()

    # A round trip from Dataset to DataFrame to Dataset should produce identical arrays.
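test_dataframe asserts exactly this round-trip property. A self-contained sketch of the contract (NumpyDataset for brevity; the to_dataframe/from_dataframe method names are assumed from the deepchem Dataset API, not shown in this diff):

import numpy as np
import deepchem as dc

dataset = dc.data.NumpyDataset(X=np.random.rand(4, 3), y=np.random.rand(4, 1))
df = dataset.to_dataframe()  # flattens X, y, w, ids into labeled columns
roundtrip = dc.data.NumpyDataset.from_dataframe(df)
# The feature array should survive the round trip unchanged.
assert np.array_equal(dataset.X, roundtrip.X)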

+58 −74
@@ -13,12 +13,15 @@ import pandas as pd
import itertools
import os
import deepchem as dc
+import logging
from deepchem.data import DiskDataset
from deepchem.utils import ScaffoldGenerator
from deepchem.utils.save import log
from deepchem.data import NumpyDataset
from deepchem.utils.save import load_data

+logger = logging.getLogger(__name__)


def generate_scaffold(smiles, include_chirality=False):
  """Compute the Bemis-Murcko scaffold for a SMILES string."""
@@ -44,10 +47,6 @@ class Splitter(object):
    Abstract base class for chemically aware splits.
    """

-  def __init__(self, verbose=False):
-    """Creates splitter object."""
-    self.verbose = verbose

  def k_fold_split(self, dataset, k, directories=None, **kwargs):
    """
    Parameters
@@ -75,7 +74,7 @@ class Splitter(object):
    :param kwargs:
    :return: list of length k tuples of (train, cv)
    """
    log("Computing K-fold split", self.verbose)
    logger.info("Computing K-fold split")
    if directories is None:
      directories = [tempfile.mkdtemp() for _ in range(2 * k)]
    else:
@@ -125,14 +124,12 @@ class Splitter(object):
                             frac_test=.1,
                             seed=None,
                             log_every_n=1000,
-                             verbose=True,
                             **kwargs):
    """
        Splits self into train/validation/test sets.
    """ Splits self into train/validation/test sets.

    Returns Dataset objects.
    """
    log("Computing train/valid/test indices", self.verbose)
    logger.info("Computing train/valid/test indices")
    train_inds, valid_inds, test_inds = self.split(
        dataset,
        seed=seed,
@@ -164,10 +161,8 @@ class Splitter(object):
                       test_dir=None,
                       seed=None,
                       frac_train=.8,
-                       verbose=True,
                       **kwargs):
    """
        Splits self into train/test sets.
    """Splits self into train/test sets.
    Returns Dataset objects.
    """
    valid_dir = tempfile.mkdtemp()
@@ -180,7 +175,6 @@ class Splitter(object):
        frac_test=1 - frac_train,
        frac_valid=0.,
        seed=seed,
-        verbose=verbose,
        **kwargs)
    return train_dataset, test_dataset
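With verbose dropped from the signature, a two-way split reduces to one call that delegates to train_valid_test_split with frac_valid=0, as the hunk above shows. A self-contained usage sketch (random data; names otherwise from the diff):

import numpy as np
import deepchem as dc

dataset = dc.data.NumpyDataset(X=np.random.rand(10, 4), y=np.random.rand(10, 1))
splitter = dc.splits.RandomSplitter()
# An 80/20 split; the validation fraction is pinned to zero internally.
train_dataset, test_dataset = splitter.train_test_split(dataset, frac_train=.8)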

@@ -191,7 +185,6 @@ class Splitter(object):
            frac_valid=None,
            frac_test=None,
            log_every_n=None,
-            verbose=False,
            **kwargs):
    """
    Stub to be filled in by child classes.
@@ -400,7 +393,7 @@ class RandomStratifiedSplitter(Splitter):

  def k_fold_split(self, dataset, k, directories=None, **kwargs):
    """Needs custom implementation due to ragged splits for stratification."""
    log("Computing K-fold split", self.verbose)
    logger.info("Computing K-fold split")
    if directories is None:
      directories = [tempfile.mkdtemp() for _ in range(k)]
    else:
@@ -433,12 +426,12 @@ class SingletaskStratifiedSplitter(Splitter):
    >>> y = np.random.rand(n_samples, n_tasks)
    >>> w = np.ones_like(y)
    >>> dataset = DiskDataset.from_numpy(np.ones((100,n_tasks)), np.ones((100,n_tasks)))
-    >>> splitter = SingletaskStratifiedSplitter(task_number=5, verbose=False)
+    >>> splitter = SingletaskStratifiedSplitter(task_number=5)
    >>> train_dataset, test_dataset = splitter.train_test_split(dataset)

    """

-  def __init__(self, task_number=0, verbose=False):
+  def __init__(self, task_number=0):
    """
    Creates splitter object.

@@ -446,11 +439,8 @@ class SingletaskStratifiedSplitter(Splitter):
    ----------
    task_number: int (Optional, Default 0)
      Task number for stratification.
-        verbose: bool (Optional, Default False)
-          Controls logging frequency.
    """
    self.task_number = task_number
-    self.verbose = verbose

  def k_fold_split(self,
                   dataset,
@@ -479,7 +469,7 @@ class SingletaskStratifiedSplitter(Splitter):
    fold_datasets: List
      List containing dc.data.Dataset objects
    """
    log("Computing K-fold split", self.verbose)
    logger.info("Computing K-fold split")
    if directories is None:
      directories = [tempfile.mkdtemp() for _ in range(k)]
    else:
@@ -731,7 +721,7 @@ class IndiceSplitter(Splitter):
    Class for splits based on input order.
    """

-  def __init__(self, verbose=False, valid_indices=None, test_indices=None):
+  def __init__(self, valid_indices=None, test_indices=None):
    """
    Parameters
    -----------
@@ -740,7 +730,6 @@ class IndiceSplitter(Splitter):
    test_indices: list of int
        indices of samples in the test set
    """
-    self.verbose = verbose
    self.valid_indices = valid_indices
    self.test_indices = test_indices

@@ -866,7 +855,7 @@ class ScaffoldSplitter(Splitter):
    valid_cutoff = (frac_train + frac_valid) * len(dataset)
    train_inds, valid_inds, test_inds = [], [], []

    log("About to sort in scaffold sets", self.verbose)
    logger.info("About to sort in scaffold sets")
    for scaffold_set in scaffold_sets:
      if len(train_inds) + len(scaffold_set) > train_cutoff:
        if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff:
@@ -884,10 +873,10 @@ class ScaffoldSplitter(Splitter):
    scaffolds = {}
    data_len = len(dataset)

    log("About to generate scaffolds", self.verbose)
    logger.info("About to generate scaffolds")
    for ind, smiles in enumerate(dataset.ids):
      if ind % log_every_n == 0:
        log("Generating scaffold %d/%d" % (ind, data_len), self.verbose)
        logger.info("Generating scaffold %d/%d" % (ind, data_len))
      scaffold = generate_scaffold(smiles)
      if scaffold not in scaffolds:
        scaffolds[scaffold] = [ind]
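generate_scaffold (defined near the top of this file) reduces a SMILES string to its Bemis-Murcko scaffold so _generate_scaffolds can bucket compounds by core ring system. A standalone sketch of the same computation done directly with RDKit's MurckoScaffold, rather than through deepchem's ScaffoldGenerator used here:

from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold


def bemis_murcko_scaffold(smiles, include_chirality=False):
  """Return the Bemis-Murcko scaffold SMILES of a molecule."""
  mol = Chem.MolFromSmiles(smiles)
  return MurckoScaffold.MurckoScaffoldSmiles(
      mol=mol, includeChirality=include_chirality)

print(bemis_murcko_scaffold("CC(=O)Oc1ccccc1C(=O)O"))  # aspirin -> c1ccccc1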
@@ -995,11 +984,10 @@ class SpecifiedSplitter(Splitter):
  Class that splits data according to user specification.
  """

-  def __init__(self, input_file, split_field, verbose=False):
+  def __init__(self, input_file, split_field):
    """Provide input information for splits."""
    raw_df = next(load_data([input_file], shard_size=None))
    self.splits = raw_df[split_field].values
-    self.verbose = verbose

  def split(self,
            dataset,
@@ -1030,13 +1018,11 @@ class SpecifiedIndexSplitter(Splitter):
  Class that splits data according to user index specification
  """

-  def __init__(self, train_inds, valid_inds, test_inds, verbose=False):
+  def __init__(self, train_inds, valid_inds, test_inds):
    """Provide input information for splits."""
    self.train_inds = train_inds
    self.valid_inds = valid_inds
    self.test_inds = test_inds
-    self.verbose = verbose
-    super(SpecifiedIndexSplitter, self).__init__(verbose)

  def split(self,
            dataset,
@@ -1044,8 +1030,7 @@ class SpecifiedIndexSplitter(Splitter):
            frac_train=.8,
            frac_valid=.1,
            frac_test=.1,
-            log_every_n=1000,
-            verbose=False):
+            log_every_n=1000):
    """
    Splits internal compounds into train/validation/test by user-specification.
    """
@@ -1054,10 +1039,9 @@ class SpecifiedIndexSplitter(Splitter):

class TimeSplitterPDBbind(Splitter):

-  def __init__(self, ids, year_file=None, verbose=False):
+  def __init__(self, ids, year_file=None):
    self.ids = ids
    self.year_file = year_file
-    self.verbose = verbose

  def split(self,
            dataset,
+76 −17
@@ -5,6 +5,7 @@ __author__ = "Bharath Ramsundar, Aneesh Pappu"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

+import os
import tempfile
import unittest
import numpy as np
@@ -13,13 +14,71 @@ from deepchem.data import NumpyDataset
from deepchem.splits import IndexSplitter


+def load_sparse_multitask_dataset():
+  """Load sparse tox multitask data, sample dataset."""
+  current_dir = os.path.dirname(os.path.abspath(__file__))
+  featurizer = dc.feat.CircularFingerprint(size=1024)
+  tasks = [
+      "task1", "task2", "task3", "task4", "task5", "task6", "task7", "task8",
+      "task9"
+  ]
+  input_file = os.path.join(current_dir,
+                            "../../models/tests/sparse_multitask_example.csv")
+  loader = dc.data.CSVLoader(
+      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
+  return loader.featurize(input_file)
+
+
+def load_multitask_data():
+  """Load example multitask data."""
+  current_dir = os.path.dirname(os.path.abspath(__file__))
+  featurizer = dc.feat.CircularFingerprint(size=1024)
+  tasks = [
+      "task0", "task1", "task2", "task3", "task4", "task5", "task6", "task7",
+      "task8", "task9", "task10", "task11", "task12", "task13", "task14",
+      "task15", "task16"
+  ]
+  input_file = os.path.join(current_dir,
+                            "../../models/tests/multitask_example.csv")
+  loader = dc.data.CSVLoader(
+      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
+  return loader.featurize(input_file)
+
+
+def load_solubility_data():
+  """Loads solubility dataset"""
+  current_dir = os.path.dirname(os.path.abspath(__file__))
+  featurizer = dc.feat.CircularFingerprint(size=1024)
+  tasks = ["log-solubility"]
+  task_type = "regression"
+  input_file = os.path.join(current_dir, "../../models/tests/example.csv")
+  loader = dc.data.CSVLoader(
+      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
+
+  return loader.featurize(input_file)
+
+
+def load_butina_data():
+  """Loads Butina example dataset"""
+  current_dir = os.path.dirname(os.path.abspath(__file__))
+  featurizer = dc.feat.CircularFingerprint(size=1024)
+  tasks = ["task"]
+  # task_type = "regression"
+  input_file = os.path.join(current_dir,
+                            "../../models/tests/butina_example.csv")
+  loader = dc.data.CSVLoader(
+      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
+
+  return loader.featurize(input_file)


class TestSplitter(unittest.TestCase):
  """
  Test some basic splitters.
  """

  def test_random_group_split(self):
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()

    groups = [0, 4, 1, 2, 3, 7, 0, 3, 1, 0]
    # 0 1 2 3 4 5 6 7 8 9
@@ -48,7 +107,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask RandomSplitter class.
    """
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    random_splitter = dc.splits.RandomSplitter()
    train_data, valid_data, test_data = \
      random_splitter.train_valid_test_split(
@@ -65,7 +124,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask IndexSplitter class.
    """
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    random_splitter = dc.splits.IndexSplitter()
    train_data, valid_data, test_data = \
      random_splitter.train_valid_test_split(
@@ -86,7 +145,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask ScaffoldSplitter class.
    """
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    scaffold_splitter = dc.splits.ScaffoldSplitter()
    train_data, valid_data, test_data = \
      scaffold_splitter.train_valid_test_split(
@@ -99,7 +158,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask Fingerprint class.
    """
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    assert (len(solubility_dataset.X) == 10)
    scaffold_splitter = dc.splits.FingerprintSplitter()
    train_data, valid_data, test_data = \
@@ -116,7 +175,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask SingletaskStratifiedSplitter class.
    """
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    stratified_splitter = dc.splits.ScaffoldSplitter()
    train_data, valid_data, test_data = \
      stratified_splitter.train_valid_test_split(
@@ -133,7 +192,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask MaxMinSplitter class.
    """
-    solubility_dataset = dc.data.tests.load_butina_data()
+    solubility_dataset = load_butina_data()
    maxmin_splitter = dc.splits.MaxMinSplitter()
    train_data, valid_data, test_data = \
      maxmin_splitter.train_valid_test_split(
@@ -146,7 +205,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask ButinaSplitter class.
    """
-    solubility_dataset = dc.data.tests.load_butina_data()
+    solubility_dataset = load_butina_data()
    butina_splitter = dc.splits.ButinaSplitter()
    train_data, valid_data, test_data = \
      butina_splitter.train_valid_test_split(
@@ -177,7 +236,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask RandomSplitter class.
    """
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    random_splitter = dc.splits.RandomSplitter()
    ids_set = set(solubility_dataset.ids)

@@ -202,7 +261,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask IndexSplitter class.
    """
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    index_splitter = dc.splits.IndexSplitter()
    ids_set = set(solubility_dataset.ids)

@@ -232,7 +291,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask ScaffoldSplitter class.
    """
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    scaffold_splitter = dc.splits.ScaffoldSplitter()
    ids_set = set(solubility_dataset.ids)

@@ -469,7 +528,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test multitask RandomSplitter class.
    """
-    multitask_dataset = dc.data.tests.load_multitask_data()
+    multitask_dataset = load_multitask_data()
    random_splitter = dc.splits.RandomSplitter()
    train_data, valid_data, test_data = \
      random_splitter.train_valid_test_split(
@@ -482,7 +541,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test multitask IndexSplitter class.
    """
-    multitask_dataset = dc.data.tests.load_multitask_data()
+    multitask_dataset = load_multitask_data()
    index_splitter = dc.splits.IndexSplitter()
    train_data, valid_data, test_data = \
      index_splitter.train_valid_test_split(
@@ -495,7 +554,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test multitask ScaffoldSplitter class.
    """
-    multitask_dataset = dc.data.tests.load_multitask_data()
+    multitask_dataset = load_multitask_data()
    scaffold_splitter = dc.splits.ScaffoldSplitter()
    train_data, valid_data, test_data = \
      scaffold_splitter.train_valid_test_split(
@@ -511,7 +570,7 @@ class TestSplitter(unittest.TestCase):
    # sparsity is determined by number of w weights that are 0 for a given
    # task structure of w np array is such that each row corresponds to a
    # sample. The loaded sparse dataset has many rows with only zeros
-    sparse_dataset = dc.data.tests.load_sparse_multitask_dataset()
+    sparse_dataset = load_sparse_multitask_dataset()

    stratified_splitter = dc.splits.RandomStratifiedSplitter()
    datasets = stratified_splitter.train_valid_test_split(
@@ -526,7 +585,7 @@ class TestSplitter(unittest.TestCase):

  def test_indice_split(self):

-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    random_splitter = dc.splits.IndiceSplitter(
        valid_indices=[7], test_indices=[8])
    train_data, valid_data, test_data = \
@@ -538,7 +597,7 @@ class TestSplitter(unittest.TestCase):

  def test_random_seed(self):
    """Test that splitters use the random seed correctly."""
-    dataset = dc.data.tests.load_solubility_data()
+    dataset = load_solubility_data()
    splitter = dc.splits.RandomSplitter()
    train1, valid1, test1 = splitter.train_valid_test_split(dataset, seed=1)
    train2, valid2, test2 = splitter.train_valid_test_split(dataset, seed=2)
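test_random_seed pins down the seed contract: the same seed must reproduce the same split, while different seeds are free to shuffle differently. A self-contained sketch of the reproducibility half:

import numpy as np
import deepchem as dc

dataset = dc.data.NumpyDataset(X=np.random.rand(10, 4), y=np.random.rand(10, 1))
splitter = dc.splits.RandomSplitter()
train1, valid1, test1 = splitter.train_valid_test_split(dataset, seed=1)
train2, valid2, test2 = splitter.train_valid_test_split(dataset, seed=1)
# Identical seeds must yield identical membership.
assert list(train1.ids) == list(train2.ids)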
+1 −0
@@ -3,6 +3,7 @@ Gathers all transformers in one place for convenient imports
"""
from deepchem.trans.transformers import undo_transforms
from deepchem.trans.transformers import undo_grad_transforms
+from deepchem.trans.transformers import Transformer
from deepchem.trans.transformers import LogTransformer
from deepchem.trans.transformers import ClippingTransformer
from deepchem.trans.transformers import NormalizationTransformer
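The single added line exports the Transformer abstract base class alongside the concrete transformers already listed here, so downstream code can reach it from the package namespace. A quick sketch of what the export enables:

import deepchem as dc

# The base class is now importable next to its concrete subclasses:
assert issubclass(dc.trans.NormalizationTransformer, dc.trans.Transformer)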