Commit 3909d44b authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

More removals

parent 7c0ac684
Loading
Loading
Loading
Loading

deepchem/data/tests/example.fasta

deleted 100644 → 0
+0 −6
Original line number Diff line number Diff line
>seq0
ACGTCCCACACGATGCATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGAT
>seq1
GTCGATGCATGCTAGCTAGCTAGCTAGCTACGATCGATCGATCGTACGATCGATCGAT
>seq2
ACACATCATCATTACTATATATTATATATCGATCGATCGATCGATCGTACGTAGCTAG
+0 −167
Original line number Diff line number Diff line
"""
Tests for FeaturizedSamples class
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import os
import unittest
import tempfile
import shutil
import deepchem as dc


class TestDataLoader(unittest.TestCase):
  """
  Test dc.data.CSVLoader featurization and dataset splitting.

  NOTE(review): the split-test methods below do not begin with ``test_``
  and are therefore never auto-discovered by unittest; they are kept under
  their original names to preserve the class's public interface.
  """

  def setUp(self):
    # Absolute directory of this test file, used to build fixture paths.
    super(TestDataLoader, self).setUp()
    self.current_dir = os.path.dirname(os.path.abspath(__file__))

  def unlabelled_test(self):
    """Featurizing a CSV with an empty task list should not raise."""
    input_file = os.path.join(self.current_dir,
                              "../../data/tests/no_labels.csv")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    loader = dc.data.CSVLoader(
        tasks=[], smiles_field="smiles", featurizer=featurizer)
    loader.featurize(input_file)

  def scaffold_test_train_valid_test_split(self):
    """Scaffold train/valid/test split of example.csv yields 8/1/1."""
    tasks = ["log-solubility"]
    input_file = os.path.join(self.current_dir,
                              "../../models/tests/example.csv")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)

    dataset = loader.featurize(input_file)

    # Splits featurized samples into train/valid/test.
    splitter = dc.splits.ScaffoldSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset)
    assert len(train_dataset) == 8
    assert len(valid_dataset) == 1
    assert len(test_dataset) == 1

  def scaffold_test_train_test_split(self):
    """Scaffold train/test split of example.csv yields 8/2."""
    tasks = ["log-solubility"]
    input_file = os.path.join(self.current_dir,
                              "../../models/tests/example.csv")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)

    dataset = loader.featurize(input_file)

    # Splits featurized samples into train/test.
    splitter = dc.splits.ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(dataset)
    assert len(train_dataset) == 8
    assert len(test_dataset) == 2

  def random_test_train_valid_test_split(self):
    """Random train/valid/test split of example.csv yields 8/1/1."""
    tasks = ["log-solubility"]
    input_file = os.path.join(self.current_dir,
                              "../../models/tests/example.csv")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)

    dataset = loader.featurize(input_file)

    # Splits featurized samples into train/valid/test.
    splitter = dc.splits.RandomSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset)
    assert len(train_dataset) == 8
    assert len(valid_dataset) == 1
    assert len(test_dataset) == 1

  def random_test_train_test_split(self):
    """Random train/test split of example.csv yields 8/2."""
    tasks = ["log-solubility"]
    input_file = os.path.join(self.current_dir,
                              "../../models/tests/example.csv")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)

    dataset = loader.featurize(input_file)

    # Splits featurized samples into train/test.
    splitter = dc.splits.RandomSplitter()
    train_dataset, test_dataset = splitter.train_test_split(dataset)
    assert len(train_dataset) == 8
    assert len(test_dataset) == 2

  def test_log_solubility_dataset(self):
    """Test of loading for simple log-solubility dataset."""
    current_dir = os.path.dirname(os.path.realpath(__file__))
    input_file = os.path.join(current_dir, "../../models/tests/example.csv")

    tasks = ["log-solubility"]
    loader = dc.data.CSVLoader(
        tasks=tasks,
        smiles_field="smiles",
        featurizer=dc.feat.CircularFingerprint(size=1024))
    dataset = loader.featurize(input_file)

    assert len(dataset) == 10

  def test_dataset_move(self):
    """Test that dataset can be moved and reloaded."""
    base_dir = tempfile.mkdtemp()
    # Remove the scratch directory even when the test fails.
    self.addCleanup(shutil.rmtree, base_dir, ignore_errors=True)
    data_dir = os.path.join(base_dir, "data")
    moved_data_dir = os.path.join(base_dir, "moved_data")
    dataset_file = os.path.join(self.current_dir,
                                "../../models/tests/example.csv")

    featurizer = dc.feat.CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    featurized_dataset = loader.featurize(dataset_file, data_dir)
    n_dataset = len(featurized_dataset)

    # Now perform move.
    shutil.move(data_dir, moved_data_dir)

    # Reloading from the new location must preserve the dataset length.
    moved_featurized_dataset = dc.data.DiskDataset(moved_data_dir)

    assert len(moved_featurized_dataset) == n_dataset
+0 −92
Original line number Diff line number Diff line
"""
Tests for ImageDataset class
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import unittest
import numpy as np
import deepchem as dc
import os
from tensorflow.python.framework import test_util


class TestImageDataset(test_util.TensorFlowTestCase):
  """
  Test ImageDataset class.

  Fixtures are image files under ``images/`` next to this test file;
  the shape assertions assume 10 images of size 28x28.
  """

  def test_load_images(self):
    """Test that ImageDataset loads images as either X or y."""

    path = os.path.join(os.path.dirname(__file__), 'images')
    files = [os.path.join(path, f) for f in os.listdir(path)]

    # First try using images for X.

    ds1 = dc.data.ImageDataset(files, np.random.random(10))
    x_shape, y_shape, w_shape, ids_shape = ds1.get_shape()
    np.testing.assert_array_equal([10, 28, 28], x_shape)
    np.testing.assert_array_equal([10], y_shape)
    np.testing.assert_array_equal([10], w_shape)
    np.testing.assert_array_equal([10], ids_shape)
    np.testing.assert_array_equal(ds1.X.shape, x_shape)
    np.testing.assert_array_equal(ds1.y.shape, y_shape)
    np.testing.assert_array_equal(ds1.w.shape, w_shape)
    np.testing.assert_array_equal(ds1.ids.shape, ids_shape)

    # Now try using images for y.

    ds2 = dc.data.ImageDataset(np.random.random(10), files)
    x_shape, y_shape, w_shape, ids_shape = ds2.get_shape()
    np.testing.assert_array_equal([10], x_shape)
    np.testing.assert_array_equal([10, 28, 28], y_shape)
    np.testing.assert_array_equal([10, 1], w_shape)
    np.testing.assert_array_equal([10], ids_shape)
    np.testing.assert_array_equal(ds2.X.shape, x_shape)
    np.testing.assert_array_equal(ds2.y.shape, y_shape)
    np.testing.assert_array_equal(ds2.w.shape, w_shape)
    np.testing.assert_array_equal(ds2.ids.shape, ids_shape)
    # Both datasets were built from the same files, so ds1's images (X)
    # must equal ds2's images (y).
    np.testing.assert_array_equal(ds1.X, ds2.y)

  def test_itersamples(self):
    """Test iterating samples of an ImageDataset."""

    path = os.path.join(os.path.dirname(__file__), 'images')
    files = [os.path.join(path, f) for f in os.listdir(path)]
    ds = dc.data.ImageDataset(files, np.random.random(10))
    X = ds.X
    i = 0
    # `sample_id` (not `id`) avoids shadowing the builtin.
    for x, y, w, sample_id in ds.itersamples():
      np.testing.assert_array_equal(x, X[i])
      assert y == ds.y[i]
      assert w == ds.w[i]
      assert sample_id == ds.ids[i]
      i += 1
    assert i == 10

  def test_iterbatches(self):
    """Test iterating batches of an ImageDataset."""

    path = os.path.join(os.path.dirname(__file__), 'images')
    files = [os.path.join(path, f) for f in os.listdir(path)]
    ds = dc.data.ImageDataset(files, np.random.random(10))
    X = ds.X
    iterated_ids = set()
    for x, y, w, ids in ds.iterbatches(2):
      np.testing.assert_array_equal([2, 28, 28], x.shape)
      np.testing.assert_array_equal([2], y.shape)
      np.testing.assert_array_equal([2], w.shape)
      np.testing.assert_array_equal([2], ids.shape)
      for i in (0, 1):
        # Every id must be a known file and appear exactly once overall.
        assert ids[i] in files
        assert ids[i] not in iterated_ids
        iterated_ids.add(ids[i])
        index = files.index(ids[i])
        np.testing.assert_array_equal(x[i], X[index])
    assert len(iterated_ids) == 10


# Allow running this test module directly as a script.
if __name__ == "__main__":
  unittest.main()

deepchem/data/tests/test_load.py

deleted 100644 → 0
+0 −162
Original line number Diff line number Diff line
"""
Testing singletask/multitask data loading capabilities.
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import os
import shutil
import logging
import unittest
import tempfile
import deepchem as dc
import numpy as np

logger = logging.getLogger(__name__)


class TestLoad(unittest.TestCase):
  """
  Test singletask/multitask data loading.
  """

  def test_move_load(self):
    """Test that datasets can be moved and loaded."""
    current_dir = os.path.dirname(os.path.realpath(__file__))
    base_dir = tempfile.mkdtemp()
    # Remove the scratch directory even when the test fails.
    self.addCleanup(shutil.rmtree, base_dir, ignore_errors=True)
    data_dir = os.path.join(base_dir, "data")
    moved_data_dir = os.path.join(base_dir, "moved_data")
    dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

    featurizer = dc.feat.CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(dataset_file, data_dir)

    X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
    shutil.move(data_dir, moved_data_dir)

    moved_dataset = dc.data.DiskDataset(moved_data_dir)

    X_moved, y_moved, w_moved, ids_moved = (moved_dataset.X, moved_dataset.y,
                                            moved_dataset.w, moved_dataset.ids)

    # The on-disk move must not alter any stored arrays.
    np.testing.assert_allclose(X, X_moved)
    np.testing.assert_allclose(y, y_moved)
    np.testing.assert_allclose(w, w_moved)
    np.testing.assert_array_equal(ids, ids_moved)

  def test_multiload(self):
    """Check can re-use featurization for multiple task selections.
    """
    # Fixed seed for reproducibility.
    np.random.seed(123)

    current_dir = os.path.dirname(os.path.realpath(__file__))
    # Scratch directory for the featurized dataset.
    data_dir = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, data_dir, ignore_errors=True)

    # Load dataset.
    logger.info("About to load dataset.")
    dataset_file = os.path.join(current_dir,
                                "../../models/tests/multitask_example.csv")

    # Featurize the multitask example once with all 17 tasks.
    logger.info("About to featurize dataset.")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    all_tasks = ["task%d" % i for i in range(17)]

    loader = dc.data.CSVLoader(
        tasks=all_tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(dataset_file, data_dir)

    X_multi, y_multi, w_multi, ids_multi = (dataset.X, dataset.y, dataset.w,
                                            dataset.ids)

    # Re-open the already-featurized dataset and slice out each task column.
    # NOTE(review): this reloads the full multitask dataset rather than
    # performing a true per-task featurization — confirm this is the intent
    # of "re-use featurization".
    y_tasks, w_tasks = [], []
    dataset = dc.data.DiskDataset(data_dir)
    for ind, task in enumerate(all_tasks):
      # Lazy %-style args so the string is only built if the log is emitted.
      logger.info("Processing task %s", task)

      X_task, y_task, w_task, ids_task = (dataset.X, dataset.y, dataset.w,
                                          dataset.ids)
      y_tasks.append(y_task[:, ind])
      w_tasks.append(w_task[:, ind])

    # Each reloaded task column must match the multitask arrays.
    for ind, task in enumerate(all_tasks):
      y_multi_task = y_multi[:, ind]
      w_multi_task = w_multi[:, ind]

      y_task = y_tasks[ind]
      w_task = w_tasks[ind]

      np.testing.assert_allclose(y_multi_task.flatten(), y_task.flatten())
      np.testing.assert_allclose(w_multi_task.flatten(), w_task.flatten())

  def test_singletask_matches_multitask_load(self):
    """Check that singletask load and multitask load of dataset are same."""
    # Fixed seed for reproducibility.
    np.random.seed(123)

    current_dir = os.path.dirname(os.path.realpath(__file__))
    # Scratch directory, reused (and wiped) for each singletask load below.
    data_dir = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, data_dir, ignore_errors=True)

    # Load dataset.
    logger.info("About to load dataset.")
    dataset_file = os.path.join(current_dir,
                                "../../models/tests/multitask_example.csv")

    # Featurize the multitask example with all 17 tasks.
    logger.info("About to featurize dataset.")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    tasks = ["task%d" % i for i in range(17)]

    # Multitask load.
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(dataset_file, data_dir)

    X_multi, y_multi, w_multi, ids_multi = (dataset.X, dataset.y, dataset.w,
                                            dataset.ids)

    # Singletask loads: re-featurize one task at a time into a fresh dir.
    y_tasks, w_tasks, ids_tasks = [], [], []
    for task in tasks:
      logger.info("Processing task %s", task)
      if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
      loader = dc.data.CSVLoader(
          tasks=[task], smiles_field="smiles", featurizer=featurizer)
      dataset = loader.featurize(dataset_file, data_dir)

      X_task, y_task, w_task, ids_task = (dataset.X, dataset.y, dataset.w,
                                          dataset.ids)
      y_tasks.append(y_task)
      w_tasks.append(w_task)
      ids_tasks.append(ids_task)

    # Each singletask column must match the corresponding multitask column.
    for ind, task in enumerate(tasks):
      y_multi_task = y_multi[:, ind]
      w_multi_task = w_multi[:, ind]

      y_task = y_tasks[ind]
      w_task = w_tasks[ind]

      np.testing.assert_allclose(y_multi_task.flatten(), y_task.flatten())
      np.testing.assert_allclose(w_multi_task.flatten(), w_task.flatten())

deepchem/data/tests/test_merge.py

deleted 100644 → 0
+0 −61
Original line number Diff line number Diff line
"""
Testing singletask/multitask dataset merging
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import os
import shutil
import tempfile
import unittest
import deepchem as dc
import numpy as np


class TestMerge(unittest.TestCase):
  """
  Test merging and subsetting of singletask/multitask datasets.
  """

  def test_merge(self):
    """Merging two featurized datasets yields one of combined length."""
    current_dir = os.path.dirname(os.path.realpath(__file__))

    dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

    # Featurize the same CSV twice to get two identically-shaped datasets.
    loader = dc.data.CSVLoader(
        tasks=["log-solubility"],
        smiles_field="smiles",
        featurizer=dc.feat.CircularFingerprint(size=1024))
    dataset_a = loader.featurize(dataset_file)
    dataset_b = loader.featurize(dataset_file)

    merged = dc.data.DiskDataset.merge([dataset_a, dataset_b])

    assert len(merged) == len(dataset_a) + len(dataset_b)

  def test_subset(self):
    """Subsetting by shard works and leaves the source dataset intact."""
    current_dir = os.path.dirname(os.path.realpath(__file__))

    dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

    # Small shard size so the dataset spans several shards.
    loader = dc.data.CSVLoader(
        tasks=["log-solubility"],
        smiles_field="smiles",
        featurizer=dc.feat.CircularFingerprint(size=1024))
    dataset = loader.featurize(dataset_file, shard_size=2)

    ids_before = dataset.ids
    _, _, _, shard1_ids = dataset.get_shard(1)
    _, _, _, shard2_ids = dataset.get_shard(2)

    subset = dataset.subset([1, 2])
    ids_after = dataset.ids

    # Two shards of size 2 give a subset of 4 with exactly those ids.
    assert len(subset) == 4
    assert sorted(subset.ids) == sorted(
        np.concatenate([shard1_ids, shard2_ids]))
    # Taking a subset must not mutate the original dataset.
    assert list(ids_before) == list(ids_after)
Loading