Commit 3909d44b authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

More removals

parent 7c0ac684
Loading
Loading
Loading
Loading

deepchem/data/tests/example.fasta

deleted 100644 → 0
+0 −6
Original line number Diff line number Diff line
>seq0
ACGTCCCACACGATGCATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGAT
>seq1
GTCGATGCATGCTAGCTAGCTAGCTAGCTACGATCGATCGATCGTACGATCGATCGAT
>seq2
ACACATCATCATTACTATATATTATATATCGATCGATCGATCGATCGTACGTAGCTAG
+0 −167
Original line number Diff line number Diff line
"""
Tests for FeaturizedSamples class
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import os
import unittest
import tempfile
import shutil
import deepchem as dc


class TestDataLoader(unittest.TestCase):
  """
  Test dc.data.CSVLoader featurization and dataset splitting.

  NOTE(review): the split-test methods below do not begin with ``test_``
  and are therefore never auto-discovered by unittest; they are kept under
  their original names to preserve the class's public interface.
  """

  def setUp(self):
    # Absolute directory of this test file, used to build fixture paths.
    super(TestDataLoader, self).setUp()
    self.current_dir = os.path.dirname(os.path.abspath(__file__))

  def unlabelled_test(self):
    """Featurizing a CSV with an empty task list should not raise."""
    input_file = os.path.join(self.current_dir,
                              "../../data/tests/no_labels.csv")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    loader = dc.data.CSVLoader(
        tasks=[], smiles_field="smiles", featurizer=featurizer)
    loader.featurize(input_file)

  def scaffold_test_train_valid_test_split(self):
    """Scaffold train/valid/test split of example.csv yields 8/1/1."""
    tasks = ["log-solubility"]
    input_file = os.path.join(self.current_dir,
                              "../../models/tests/example.csv")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)

    dataset = loader.featurize(input_file)

    # Splits featurized samples into train/valid/test.
    splitter = dc.splits.ScaffoldSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset)
    assert len(train_dataset) == 8
    assert len(valid_dataset) == 1
    assert len(test_dataset) == 1

  def scaffold_test_train_test_split(self):
    """Scaffold train/test split of example.csv yields 8/2."""
    tasks = ["log-solubility"]
    input_file = os.path.join(self.current_dir,
                              "../../models/tests/example.csv")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)

    dataset = loader.featurize(input_file)

    # Splits featurized samples into train/test.
    splitter = dc.splits.ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(dataset)
    assert len(train_dataset) == 8
    assert len(test_dataset) == 2

  def random_test_train_valid_test_split(self):
    """Random train/valid/test split of example.csv yields 8/1/1."""
    tasks = ["log-solubility"]
    input_file = os.path.join(self.current_dir,
                              "../../models/tests/example.csv")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)

    dataset = loader.featurize(input_file)

    # Splits featurized samples into train/valid/test.
    splitter = dc.splits.RandomSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset)
    assert len(train_dataset) == 8
    assert len(valid_dataset) == 1
    assert len(test_dataset) == 1

  def random_test_train_test_split(self):
    """Random train/test split of example.csv yields 8/2."""
    tasks = ["log-solubility"]
    input_file = os.path.join(self.current_dir,
                              "../../models/tests/example.csv")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)

    dataset = loader.featurize(input_file)

    # Splits featurized samples into train/test.
    splitter = dc.splits.RandomSplitter()
    train_dataset, test_dataset = splitter.train_test_split(dataset)
    assert len(train_dataset) == 8
    assert len(test_dataset) == 2

  def test_log_solubility_dataset(self):
    """Test of loading for simple log-solubility dataset."""
    current_dir = os.path.dirname(os.path.realpath(__file__))
    input_file = os.path.join(current_dir, "../../models/tests/example.csv")

    tasks = ["log-solubility"]
    loader = dc.data.CSVLoader(
        tasks=tasks,
        smiles_field="smiles",
        featurizer=dc.feat.CircularFingerprint(size=1024))
    dataset = loader.featurize(input_file)

    assert len(dataset) == 10

  def test_dataset_move(self):
    """Test that dataset can be moved and reloaded."""
    base_dir = tempfile.mkdtemp()
    # Remove the scratch directory even when the test fails.
    self.addCleanup(shutil.rmtree, base_dir, ignore_errors=True)
    data_dir = os.path.join(base_dir, "data")
    moved_data_dir = os.path.join(base_dir, "moved_data")
    dataset_file = os.path.join(self.current_dir,
                                "../../models/tests/example.csv")

    featurizer = dc.feat.CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    featurized_dataset = loader.featurize(dataset_file, data_dir)
    n_dataset = len(featurized_dataset)

    # Now perform move.
    shutil.move(data_dir, moved_data_dir)

    # Reloading from the new location must preserve the dataset length.
    moved_featurized_dataset = dc.data.DiskDataset(moved_data_dir)

    assert len(moved_featurized_dataset) == n_dataset
+0 −92
Original line number Diff line number Diff line
"""
Tests for ImageDataset class
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import unittest
import numpy as np
import deepchem as dc
import os
from tensorflow.python.framework import test_util


class TestImageDataset(test_util.TensorFlowTestCase):
  """
  Test ImageDataset class.

  Fixtures are image files under ``images/`` next to this test file;
  the shape assertions assume 10 images of size 28x28.
  """

  def test_load_images(self):
    """Test that ImageDataset loads images as either X or y."""

    path = os.path.join(os.path.dirname(__file__), 'images')
    files = [os.path.join(path, f) for f in os.listdir(path)]

    # First try using images for X.

    ds1 = dc.data.ImageDataset(files, np.random.random(10))
    x_shape, y_shape, w_shape, ids_shape = ds1.get_shape()
    np.testing.assert_array_equal([10, 28, 28], x_shape)
    np.testing.assert_array_equal([10], y_shape)
    np.testing.assert_array_equal([10], w_shape)
    np.testing.assert_array_equal([10], ids_shape)
    np.testing.assert_array_equal(ds1.X.shape, x_shape)
    np.testing.assert_array_equal(ds1.y.shape, y_shape)
    np.testing.assert_array_equal(ds1.w.shape, w_shape)
    np.testing.assert_array_equal(ds1.ids.shape, ids_shape)

    # Now try using images for y.

    ds2 = dc.data.ImageDataset(np.random.random(10), files)
    x_shape, y_shape, w_shape, ids_shape = ds2.get_shape()
    np.testing.assert_array_equal([10], x_shape)
    np.testing.assert_array_equal([10, 28, 28], y_shape)
    np.testing.assert_array_equal([10, 1], w_shape)
    np.testing.assert_array_equal([10], ids_shape)
    np.testing.assert_array_equal(ds2.X.shape, x_shape)
    np.testing.assert_array_equal(ds2.y.shape, y_shape)
    np.testing.assert_array_equal(ds2.w.shape, w_shape)
    np.testing.assert_array_equal(ds2.ids.shape, ids_shape)
    # Both datasets were built from the same files, so ds1's images (X)
    # must equal ds2's images (y).
    np.testing.assert_array_equal(ds1.X, ds2.y)

  def test_itersamples(self):
    """Test iterating samples of an ImageDataset."""

    path = os.path.join(os.path.dirname(__file__), 'images')
    files = [os.path.join(path, f) for f in os.listdir(path)]
    ds = dc.data.ImageDataset(files, np.random.random(10))
    X = ds.X
    i = 0
    # `sample_id` (not `id`) avoids shadowing the builtin.
    for x, y, w, sample_id in ds.itersamples():
      np.testing.assert_array_equal(x, X[i])
      assert y == ds.y[i]
      assert w == ds.w[i]
      assert sample_id == ds.ids[i]
      i += 1
    assert i == 10

  def test_iterbatches(self):
    """Test iterating batches of an ImageDataset."""

    path = os.path.join(os.path.dirname(__file__), 'images')
    files = [os.path.join(path, f) for f in os.listdir(path)]
    ds = dc.data.ImageDataset(files, np.random.random(10))
    X = ds.X
    iterated_ids = set()
    for x, y, w, ids in ds.iterbatches(2):
      np.testing.assert_array_equal([2, 28, 28], x.shape)
      np.testing.assert_array_equal([2], y.shape)
      np.testing.assert_array_equal([2], w.shape)
      np.testing.assert_array_equal([2], ids.shape)
      for i in (0, 1):
        # Every id must be a known file and appear exactly once overall.
        assert ids[i] in files
        assert ids[i] not in iterated_ids
        iterated_ids.add(ids[i])
        index = files.index(ids[i])
        np.testing.assert_array_equal(x[i], X[index])
    assert len(iterated_ids) == 10


# Allow running this test module directly as a script.
if __name__ == "__main__":
  unittest.main()

deepchem/data/tests/test_load.py

deleted 100644 → 0
+0 −162
Original line number Diff line number Diff line
"""
Testing singletask/multitask data loading capabilities.
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import os
import shutil
import logging
import unittest
import tempfile
import deepchem as dc
import numpy as np

logger = logging.getLogger(__name__)


class TestLoad(unittest.TestCase):
  """
  Test singletask/multitask data loading.
  """

  def test_move_load(self):
    """Test that datasets can be moved and loaded."""
    current_dir = os.path.dirname(os.path.realpath(__file__))
    base_dir = tempfile.mkdtemp()
    # Remove the scratch directory even when the test fails.
    self.addCleanup(shutil.rmtree, base_dir, ignore_errors=True)
    data_dir = os.path.join(base_dir, "data")
    moved_data_dir = os.path.join(base_dir, "moved_data")
    dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

    featurizer = dc.feat.CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(dataset_file, data_dir)

    X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
    shutil.move(data_dir, moved_data_dir)

    moved_dataset = dc.data.DiskDataset(moved_data_dir)

    X_moved, y_moved, w_moved, ids_moved = (moved_dataset.X, moved_dataset.y,
                                            moved_dataset.w, moved_dataset.ids)

    # The on-disk move must not alter any stored arrays.
    np.testing.assert_allclose(X, X_moved)
    np.testing.assert_allclose(y, y_moved)
    np.testing.assert_allclose(w, w_moved)
    np.testing.assert_array_equal(ids, ids_moved)

  def test_multiload(self):
    """Check can re-use featurization for multiple task selections.
    """
    # Fixed seed for reproducibility.
    np.random.seed(123)

    current_dir = os.path.dirname(os.path.realpath(__file__))
    # Scratch directory for the featurized dataset.
    data_dir = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, data_dir, ignore_errors=True)

    # Load dataset.
    logger.info("About to load dataset.")
    dataset_file = os.path.join(current_dir,
                                "../../models/tests/multitask_example.csv")

    # Featurize the multitask example once with all 17 tasks.
    logger.info("About to featurize dataset.")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    all_tasks = ["task%d" % i for i in range(17)]

    loader = dc.data.CSVLoader(
        tasks=all_tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(dataset_file, data_dir)

    X_multi, y_multi, w_multi, ids_multi = (dataset.X, dataset.y, dataset.w,
                                            dataset.ids)

    # Re-open the already-featurized dataset and slice out each task column.
    # NOTE(review): this reloads the full multitask dataset rather than
    # performing a true per-task featurization — confirm this is the intent
    # of "re-use featurization".
    y_tasks, w_tasks = [], []
    dataset = dc.data.DiskDataset(data_dir)
    for ind, task in enumerate(all_tasks):
      # Lazy %-style args so the string is only built if the log is emitted.
      logger.info("Processing task %s", task)

      X_task, y_task, w_task, ids_task = (dataset.X, dataset.y, dataset.w,
                                          dataset.ids)
      y_tasks.append(y_task[:, ind])
      w_tasks.append(w_task[:, ind])

    # Each reloaded task column must match the multitask arrays.
    for ind, task in enumerate(all_tasks):
      y_multi_task = y_multi[:, ind]
      w_multi_task = w_multi[:, ind]

      y_task = y_tasks[ind]
      w_task = w_tasks[ind]

      np.testing.assert_allclose(y_multi_task.flatten(), y_task.flatten())
      np.testing.assert_allclose(w_multi_task.flatten(), w_task.flatten())

  def test_singletask_matches_multitask_load(self):
    """Check that singletask load and multitask load of dataset are same."""
    # Fixed seed for reproducibility.
    np.random.seed(123)

    current_dir = os.path.dirname(os.path.realpath(__file__))
    # Scratch directory, reused (and wiped) for each singletask load below.
    data_dir = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, data_dir, ignore_errors=True)

    # Load dataset.
    logger.info("About to load dataset.")
    dataset_file = os.path.join(current_dir,
                                "../../models/tests/multitask_example.csv")

    # Featurize the multitask example with all 17 tasks.
    logger.info("About to featurize dataset.")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    tasks = ["task%d" % i for i in range(17)]

    # Multitask load.
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(dataset_file, data_dir)

    X_multi, y_multi, w_multi, ids_multi = (dataset.X, dataset.y, dataset.w,
                                            dataset.ids)

    # Singletask loads: re-featurize one task at a time into a fresh dir.
    y_tasks, w_tasks, ids_tasks = [], [], []
    for task in tasks:
      logger.info("Processing task %s", task)
      if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
      loader = dc.data.CSVLoader(
          tasks=[task], smiles_field="smiles", featurizer=featurizer)
      dataset = loader.featurize(dataset_file, data_dir)

      X_task, y_task, w_task, ids_task = (dataset.X, dataset.y, dataset.w,
                                          dataset.ids)
      y_tasks.append(y_task)
      w_tasks.append(w_task)
      ids_tasks.append(ids_task)

    # Each singletask column must match the corresponding multitask column.
    for ind, task in enumerate(tasks):
      y_multi_task = y_multi[:, ind]
      w_multi_task = w_multi[:, ind]

      y_task = y_tasks[ind]
      w_task = w_tasks[ind]

      np.testing.assert_allclose(y_multi_task.flatten(), y_task.flatten())
      np.testing.assert_allclose(w_multi_task.flatten(), w_task.flatten())

deepchem/data/tests/test_merge.py

deleted 100644 → 0
+0 −61
Original line number Diff line number Diff line
"""
Testing singletask/multitask dataset merging
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import os
import shutil
import tempfile
import unittest
import deepchem as dc
import numpy as np


class TestMerge(unittest.TestCase):
  """
  Test merging and subsetting of singletask/multitask datasets.
  """

  def test_merge(self):
    """Merging two featurized datasets yields one of combined length."""
    current_dir = os.path.dirname(os.path.realpath(__file__))

    dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

    # Featurize the same CSV twice to get two identically-shaped datasets.
    loader = dc.data.CSVLoader(
        tasks=["log-solubility"],
        smiles_field="smiles",
        featurizer=dc.feat.CircularFingerprint(size=1024))
    dataset_a = loader.featurize(dataset_file)
    dataset_b = loader.featurize(dataset_file)

    merged = dc.data.DiskDataset.merge([dataset_a, dataset_b])

    assert len(merged) == len(dataset_a) + len(dataset_b)

  def test_subset(self):
    """Subsetting by shard works and leaves the source dataset intact."""
    current_dir = os.path.dirname(os.path.realpath(__file__))

    dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

    # Small shard size so the dataset spans several shards.
    loader = dc.data.CSVLoader(
        tasks=["log-solubility"],
        smiles_field="smiles",
        featurizer=dc.feat.CircularFingerprint(size=1024))
    dataset = loader.featurize(dataset_file, shard_size=2)

    ids_before = dataset.ids
    _, _, _, shard1_ids = dataset.get_shard(1)
    _, _, _, shard2_ids = dataset.get_shard(2)

    subset = dataset.subset([1, 2])
    ids_after = dataset.ids

    # Two shards of size 2 give a subset of 4 with exactly those ids.
    assert len(subset) == 4
    assert sorted(subset.ids) == sorted(
        np.concatenate([shard1_ids, shard2_ids]))
    # Taking a subset must not mutate the original dataset.
    assert list(ids_before) == list(ids_after)
Loading