Commit 19029da8 authored by Bharath Ramsundar

Fixes

parent 4e3f3189
+0 −122
"""
General API for testing dataset objects
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import unittest
import tempfile
import os
import shutil
import numpy as np
import deepchem as dc


def load_solubility_data():
  """Loads solubility dataset"""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  task_type = "regression"
  input_file = os.path.join(current_dir, "../../models/tests/example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)

  return loader.featurize(input_file)


def load_butina_data():
  """Loads solubility dataset"""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["task"]
  # task_type = "regression"
  input_file = os.path.join(current_dir,
                            "../../models/tests/butina_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)

  return loader.featurize(input_file)


def load_multitask_data():
  """Load example multitask data."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = [
      "task0", "task1", "task2", "task3", "task4", "task5", "task6", "task7",
      "task8", "task9", "task10", "task11", "task12", "task13", "task14",
      "task15", "task16"
  ]
  input_file = os.path.join(current_dir,
                            "../../models/tests/multitask_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)


def load_classification_data():
  """Loads classification data from example.csv"""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["outcome"]
  task_type = "classification"
  input_file = os.path.join(current_dir,
                            "../../models/tests/example_classification.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)


def load_sparse_multitask_dataset():
  """Load sparse tox multitask data, sample dataset."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = [
      "task1", "task2", "task3", "task4", "task5", "task6", "task7", "task8",
      "task9"
  ]
  input_file = os.path.join(current_dir,
                            "../../models/tests/sparse_multitask_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)


def load_feat_multitask_data():
  """Load example with numerical features, tasks."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  features = ["feat0", "feat1", "feat2", "feat3", "feat4", "feat5"]
  featurizer = dc.feat.UserDefinedFeaturizer(features)
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5"]
  input_file = os.path.join(current_dir,
                            "../../models/tests/feat_multitask_example.csv")
  loader = dc.data.UserCSVLoader(
      tasks=tasks, featurizer=featurizer, id_field="id")
  return loader.featurize(input_file)


def load_gaussian_cdf_data():
  """Load example with numbers sampled from Gaussian normal distribution.
     Each feature and task is a column of values that is sampled
     from a normal distribution of mean 0, stdev 1."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  features = ["feat0", "feat1"]
  featurizer = dc.feat.UserDefinedFeaturizer(features)
  tasks = ["task0", "task1"]
  input_file = os.path.join(current_dir,
                            "../../models/tests/gaussian_cdf_example.csv")
  loader = dc.data.UserCSVLoader(
      tasks=tasks, featurizer=featurizer, id_field="id")
  return loader.featurize(input_file)


def load_unlabelled_data():
  """Loads an example dataset that has SMILES but no task labels."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = []
  input_file = os.path.join(current_dir, "../../data/tests/no_labels.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)
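
The removed helpers above all follow the same pattern: build a featurizer, point a CSV loader at a small fixture under models/tests, and featurize it. As a rough illustration of the column layout described in load_gaussian_cdf_data's docstring, here is a minimal sketch (not part of this commit; the output path and row count are arbitrary) that generates a comparable CSV with pandas and loads it through UserCSVLoader:

import numpy as np
import pandas as pd
import deepchem as dc

# Build a small CSV whose feature and task columns are standard-normal samples,
# mirroring the column layout load_gaussian_cdf_data expects.
np.random.seed(0)
n_rows = 100
df = pd.DataFrame({
    "id": np.arange(n_rows),
    "feat0": np.random.randn(n_rows),
    "feat1": np.random.randn(n_rows),
    "task0": np.random.randn(n_rows),
    "task1": np.random.randn(n_rows),
})
df.to_csv("gaussian_cdf_example.csv", index=False)  # arbitrary output path

# Load it the same way the helper does: features come straight from the named
# columns rather than being computed from SMILES.
featurizer = dc.feat.UserDefinedFeaturizer(["feat0", "feat1"])
loader = dc.data.UserCSVLoader(
    tasks=["task0", "task1"], featurizer=featurizer, id_field="id")
dataset = loader.featurize("gaussian_cdf_example.csv")
print(dataset.X.shape, dataset.y.shape)  # expected: (100, 2) (100, 2)
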
+38 −9
@@ -24,6 +24,35 @@ except ImportError:
  PYTORCH_IMPORT_FAILED = True


def load_solubility_data():
  """Loads solubility dataset"""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  task_type = "regression"
  input_file = os.path.join(current_dir, "../../models/tests/example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)

  return loader.create_dataset(input_file)


def load_multitask_data():
  """Load example multitask data."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = [
      "task0", "task1", "task2", "task3", "task4", "task5", "task6", "task7",
      "task8", "task9", "task10", "task11", "task12", "task13", "task14",
      "task15", "task16"
  ]
  input_file = os.path.join(current_dir,
                            "../../models/tests/multitask_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)


class TestDatasets(test_util.TensorFlowTestCase):
  """
  Test basic top-level API for dataset objects.
@@ -172,10 +201,10 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_get_task_names(self):
    """Test that get_task_names returns correct task_names"""
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    assert solubility_dataset.get_task_names() == ["log-solubility"]

    multitask_dataset = dc.data.tests.load_multitask_data()
    multitask_dataset = load_multitask_data()
    assert sorted(multitask_dataset.get_task_names()) == sorted([
        "task0", "task1", "task2", "task3", "task4", "task5", "task6", "task7",
        "task8", "task9", "task10", "task11", "task12", "task13", "task14",
@@ -184,20 +213,20 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_get_data_shape(self):
    """Test that get_data_shape returns currect data shape"""
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    assert solubility_dataset.get_data_shape() == (1024,)

    multitask_dataset = dc.data.tests.load_multitask_data()
    multitask_dataset = load_multitask_data()
    assert multitask_dataset.get_data_shape() == (1024,)

  def test_len(self):
    """Test that len(dataset) works."""
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    assert len(solubility_dataset) == 10

  def test_reshard(self):
    """Test that resharding the dataset works."""
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
                    solubility_dataset.w, solubility_dataset.ids)
    assert solubility_dataset.get_number_shards() == 1
@@ -302,7 +331,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_iterbatches(self):
    """Test that iterating over batches of data works."""
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    batch_size = 2
    data_shape = solubility_dataset.get_data_shape()
    tasks = solubility_dataset.get_task_names()
@@ -331,7 +360,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_itersamples_disk(self):
    """Test that iterating over samples in a DiskDataset works."""
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    X = solubility_dataset.X
    y = solubility_dataset.y
    w = solubility_dataset.w
@@ -372,7 +401,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_transform_disk(self):
    """Test that the transform() method works for DiskDatasets."""
    dataset = dc.data.tests.load_solubility_data()
    dataset = load_solubility_data()
    X = dataset.X
    y = dataset.y
    w = dataset.w
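
The iterbatches test above leans on the standard Dataset batch-iteration contract. A minimal sketch of that contract, assuming the usual (X, y, w, ids) batch tuples and reusing the load_solubility_data helper now defined at the top of this test module:

solubility_dataset = load_solubility_data()
data_shape = solubility_dataset.get_data_shape()

# Each batch is an (X, y, w, ids) tuple; feature rows keep the dataset's shape.
for X_b, y_b, w_b, ids_b in solubility_dataset.iterbatches(batch_size=2):
  assert X_b.shape[1:] == data_shape
  assert len(X_b) == len(y_b) == len(w_b) == len(ids_b)
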
+75 −17
@@ -13,13 +13,71 @@ from deepchem.data import NumpyDataset
from deepchem.splits import IndexSplitter


def load_sparse_multitask_dataset():
  """Load sparse tox multitask data, sample dataset."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = [
      "task1", "task2", "task3", "task4", "task5", "task6", "task7", "task8",
      "task9"
  ]
  input_file = os.path.join(current_dir,
                            "../../models/tests/sparse_multitask_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)


def load_multitask_data():
  """Load example multitask data."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = [
      "task0", "task1", "task2", "task3", "task4", "task5", "task6", "task7",
      "task8", "task9", "task10", "task11", "task12", "task13", "task14",
      "task15", "task16"
  ]
  input_file = os.path.join(current_dir,
                            "../../models/tests/multitask_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)


def load_solubility_data():
  """Loads solubility dataset"""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  task_type = "regression"
  input_file = os.path.join(current_dir, "../../models/tests/example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)

  return loader.featurize(input_file)


def load_butina_data():
  """Loads solubility dataset"""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["task"]
  # task_type = "regression"
  input_file = os.path.join(current_dir,
                            "../../models/tests/butina_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)

  return loader.featurize(input_file)


class TestSplitter(unittest.TestCase):
  """
  Test some basic splitters.
  """

  def test_random_group_split(self):
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()

    groups = [0, 4, 1, 2, 3, 7, 0, 3, 1, 0]
    # 0 1 2 3 4 5 6 7 8 9
@@ -48,7 +106,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask RandomSplitter class.
    """
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    random_splitter = dc.splits.RandomSplitter()
    train_data, valid_data, test_data = \
      random_splitter.train_valid_test_split(
@@ -65,7 +123,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask IndexSplitter class.
    """
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    random_splitter = dc.splits.IndexSplitter()
    train_data, valid_data, test_data = \
      random_splitter.train_valid_test_split(
@@ -86,7 +144,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask ScaffoldSplitter class.
    """
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    scaffold_splitter = dc.splits.ScaffoldSplitter()
    train_data, valid_data, test_data = \
      scaffold_splitter.train_valid_test_split(
@@ -99,7 +157,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask Fingerprint class.
    """
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    assert (len(solubility_dataset.X) == 10)
    scaffold_splitter = dc.splits.FingerprintSplitter()
    train_data, valid_data, test_data = \
@@ -116,7 +174,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask SingletaskStratifiedSplitter class.
    """
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    stratified_splitter = dc.splits.ScaffoldSplitter()
    train_data, valid_data, test_data = \
      stratified_splitter.train_valid_test_split(
@@ -133,7 +191,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask MaxMinSplitter class.
    """
    solubility_dataset = dc.data.tests.load_butina_data()
    solubility_dataset = load_butina_data()
    maxmin_splitter = dc.splits.MaxMinSplitter()
    train_data, valid_data, test_data = \
      maxmin_splitter.train_valid_test_split(
@@ -146,7 +204,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask ButinaSplitter class.
    """
    solubility_dataset = dc.data.tests.load_butina_data()
    solubility_dataset = load_butina_data()
    butina_splitter = dc.splits.ButinaSplitter()
    train_data, valid_data, test_data = \
      butina_splitter.train_valid_test_split(
@@ -177,7 +235,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask RandomSplitter class.
    """
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    random_splitter = dc.splits.RandomSplitter()
    ids_set = set(solubility_dataset.ids)

@@ -202,7 +260,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask IndexSplitter class.
    """
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    index_splitter = dc.splits.IndexSplitter()
    ids_set = set(solubility_dataset.ids)

@@ -232,7 +290,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test singletask ScaffoldSplitter class.
    """
    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    scaffold_splitter = dc.splits.ScaffoldSplitter()
    ids_set = set(solubility_dataset.ids)

@@ -469,7 +527,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test multitask RandomSplitter class.
    """
    multitask_dataset = dc.data.tests.load_multitask_data()
    multitask_dataset = load_multitask_data()
    random_splitter = dc.splits.RandomSplitter()
    train_data, valid_data, test_data = \
      random_splitter.train_valid_test_split(
@@ -482,7 +540,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test multitask IndexSplitter class.
    """
    multitask_dataset = dc.data.tests.load_multitask_data()
    multitask_dataset = load_multitask_data()
    index_splitter = dc.splits.IndexSplitter()
    train_data, valid_data, test_data = \
      index_splitter.train_valid_test_split(
@@ -495,7 +553,7 @@ class TestSplitter(unittest.TestCase):
    """
    Test multitask ScaffoldSplitter class.
    """
    multitask_dataset = dc.data.tests.load_multitask_data()
    multitask_dataset = load_multitask_data()
    scaffold_splitter = dc.splits.ScaffoldSplitter()
    train_data, valid_data, test_data = \
      scaffold_splitter.train_valid_test_split(
@@ -511,7 +569,7 @@ class TestSplitter(unittest.TestCase):
    # Sparsity is determined by the number of w weights that are 0 for a given
    # task. The structure of the w np array is such that each row corresponds
    # to a sample. The loaded sparse dataset has many rows with only zeros.
    sparse_dataset = dc.data.tests.load_sparse_multitask_dataset()
    sparse_dataset = load_sparse_multitask_dataset()

    stratified_splitter = dc.splits.RandomStratifiedSplitter()
    datasets = stratified_splitter.train_valid_test_split(
@@ -526,7 +584,7 @@ class TestSplitter(unittest.TestCase):

  def test_indice_split(self):

    solubility_dataset = dc.data.tests.load_solubility_data()
    solubility_dataset = load_solubility_data()
    random_splitter = dc.splits.IndiceSplitter(
        valid_indices=[7], test_indices=[8])
    train_data, valid_data, test_data = \
@@ -538,7 +596,7 @@ class TestSplitter(unittest.TestCase):

  def test_random_seed(self):
    """Test that splitters use the random seed correctly."""
    dataset = dc.data.tests.load_solubility_data()
    dataset = load_solubility_data()
    splitter = dc.splits.RandomSplitter()
    train1, valid1, test1 = splitter.train_valid_test_split(dataset, seed=1)
    train2, valid2, test2 = splitter.train_valid_test_split(dataset, seed=2)
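
The comment in the stratified-split hunk above is terse, so a small self-contained sketch may help; it uses synthetic arrays rather than the CSV fixture the test loads, and the array sizes are arbitrary:

import numpy as np
import deepchem as dc

np.random.seed(0)
n_samples, n_features, n_tasks = 20, 8, 3
X = np.random.rand(n_samples, n_features)
y = np.random.randint(2, size=(n_samples, n_tasks)).astype(float)
# A zero weight marks a missing label for that (sample, task) pair, so most
# entries of each task column are "sparse" here.
w = np.random.binomial(1, 0.3, size=(n_samples, n_tasks)).astype(float)
dataset = dc.data.NumpyDataset(X, y, w)

# Fraction of missing labels per task: this is the sparsity the stratified
# splitter has to respect when it allocates samples to train/valid/test.
sparsity = (dataset.w == 0).sum(axis=0) / dataset.w.shape[0]
print(sparsity)
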
+1 −0
@@ -3,6 +3,7 @@ Gathers all transformers in one place for convenient imports
"""
from deepchem.trans.transformers import undo_transforms
from deepchem.trans.transformers import undo_grad_transforms
from deepchem.trans.transformers import Transformer
from deepchem.trans.transformers import LogTransformer
from deepchem.trans.transformers import ClippingTransformer
from deepchem.trans.transformers import NormalizationTransformer
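
The +1 −0 hunk above adds a single import to the transformers convenience module; judging from the surrounding lines it is the re-export of the Transformer base class. A rough usage sketch, assuming the existing NormalizationTransformer / undo_transforms API and a tiny in-memory dataset standing in for the CSV fixtures used by the tests:

import numpy as np
import deepchem as dc
from deepchem.trans import Transformer, NormalizationTransformer, undo_transforms

X = np.random.rand(10, 4)
y = np.random.rand(10, 1)
dataset = dc.data.NumpyDataset(X, y)

transformer = NormalizationTransformer(transform_y=True, dataset=dataset)
assert isinstance(transformer, Transformer)  # the newly re-exported base class

# transform() returns a dataset whose y values have zero mean and unit variance;
# undo_transforms maps transformed values back to the original scale.
normalized = transformer.transform(dataset)
y_restored = undo_transforms(normalized.y, [transformer])
assert np.allclose(y_restored, dataset.y)
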
+96 −16
File changed; preview size limit exceeded, so its changes are collapsed.