Merge pull request #1327 from rbharath/bbbc (a131d400) · Commits · 钟慕尧 / deepchem

deepchem/data/data_loader.py

+58 −22

Original line number	Diff line number	Diff line
		@@ -24,6 +24,7 @@ from deepchem.data import DiskDataset
		from deepchem.data import NumpyDataset
		from scipy import misc
		import zipfile
		from PIL import Image


		def convert_df_to_numpy(df, tasks, verbose=False):
		@@ -335,6 +336,11 @@ class FASTALoader(DataLoader):
		class ImageLoader(DataLoader):
		"""
		Handles loading of image files.

		This class allows for loading of images in various formats. For user
		convenience, also accepts zip-files and directories of images and uses some
		limited intelligence to attempt to traverse subdirectories which contain
		images.
		"""

		def __init__(self, tasks=None):
		@@ -343,38 +349,68 @@ class ImageLoader(DataLoader):
		tasks = []
		self.tasks = tasks

		def featurize(self, input_files):
		def featurize(self, input_files, in_memory=True):
		"""Featurizes image files.

		Parameters
		----------
		input_files: list
		Each file in this list should either be of a supported image format (.png
		only for now) or of a compressed folder of image files (only .zip for now).
		Each file in this list should either be of a supported image format
		(.png, .tif only for now) or of a compressed folder of image files
		(only .zip for now).
		in_memory: bool
		If true, return in-memory NumpyDataset. Else return DiskDataset.
		"""
		if not isinstance(input_files, list):
		input_files = [input_files]

		images = []
		image_files = []
		# Sometimes zip files contain directories within. Traverse directories
		while len(input_files) > 0:
		remainder = []
		for input_file in input_files:
		filename, extension = os.path.splitext(input_file)
		# TODO(rbharath): Add support for more extensions
		if extension == ".zip":
		if os.path.isdir(input_file):
		dirfiles = [
		os.path.join(input_file, subfile)
		for subfile in os.listdir(input_file)
		]
		remainder += dirfiles
		elif extension == ".zip":
		zip_dir = tempfile.mkdtemp()
		zip_ref = zipfile.ZipFile(input_file, 'r')
		zip_ref.extractall(path=zip_dir)
		zip_ref.close()
		image_files += [
		zip_files = [
		os.path.join(zip_dir, name) for name in zip_ref.namelist()
		]
		elif extension == ".png":
		for zip_file in zip_files:
		_, extension = os.path.splitext(zip_file)
		if extension in [".png", ".tif"]:
		image_files.append(zip_file)
		elif extension in [".png", ".tif"]:
		image_files.append(input_file)
		else:
		raise ValueError("Unsupported file format")
		input_files = remainder

		images = []
		for image_file in image_files:
		_, extension = os.path.splitext(image_file)
		if extension == ".png":
		image = misc.imread(image_file)
		images.append(image)
		elif extension == ".tif":
		im = Image.open(image_file)
		imarray = np.array(im)
		images.append(imarray)
		else:
		raise ValueError("Unsupported image filetype for %s" % image_file)
		images = np.array(images)
		if in_memory:
		return NumpyDataset(images)
		else:
		# from_numpy currently requires labels. Make dummy labels
		labels = np.zeros(len(images))
		return DiskDataset.from_numpy(images, labels)

deepchem/data/tests/a_image.tif

0 → 100644

+15.9 KiB

File added.

No diff preview for this file type.

View file

deepchem/data/tests/test_image_loader.py

+38 −5

Original line number	Diff line number	Diff line
		@@ -19,13 +19,15 @@ class TestImageLoader(unittest.TestCase):

		def setUp(self):
		super(TestImageLoader, self).setUp()
		self.current_dir = os.path.dirname(os.path.abspath(__file__))
		self.tif_image_path = os.path.join(self.current_dir, "a_image.tif")

		# Create image file
		self.data_dir = tempfile.mkdtemp()
		self.face = misc.face()
		self.face_path = os.path.join(self.data_dir, "face.png")
		misc.imsave(self.face_path, self.face)
		self.face_copy_path = os.path.join(self.data_dir, "face.png")
		self.face_copy_path = os.path.join(self.data_dir, "face_copy.png")
		misc.imsave(self.face_copy_path, self.face)

		# Create zip of image file
		@@ -42,23 +44,54 @@ class TestImageLoader(unittest.TestCase):
		zipf.write(self.face_copy_path)
		zipf.close()

		def test_simple_load(self):
		# Create zip of multiple image files, multiple_types
		self.multitype_zip_path = os.path.join(self.data_dir, "multitype_face.zip")
		zipf = zipfile.ZipFile(self.multitype_zip_path, "w", zipfile.ZIP_DEFLATED)
		zipf.write(self.face_path)
		zipf.write(self.tif_image_path)
		zipf.close()

		# Create image directory
		self.image_dir = tempfile.mkdtemp()
		face_path = os.path.join(self.image_dir, "face.png")
		misc.imsave(face_path, self.face)
		face_copy_path = os.path.join(self.image_dir, "face_copy.png")
		misc.imsave(face_copy_path, self.face)

		def test_png_simple_load(self):
		loader = dc.data.ImageLoader()
		dataset = loader.featurize(self.face_path)
		# These are the known dimensions of face.png
		assert dataset.X.shape == (1, 768, 1024, 3)

		def test_multi_load(self):
		def test_tif_simple_load(self):
		loader = dc.data.ImageLoader()
		dataset = loader.featurize(self.tif_image_path)
		# TODO(rbharath): Where are the color channels?
		assert dataset.X.shape == (1, 44, 330)

		def test_png_multi_load(self):
		loader = dc.data.ImageLoader()
		dataset = loader.featurize([self.face_path, self.face_copy_path])
		assert dataset.X.shape == (2, 768, 1024, 3)

		def test_zip_load(self):
		def test_png_zip_load(self):
		loader = dc.data.ImageLoader()
		dataset = loader.featurize(self.zip_path)
		assert dataset.X.shape == (1, 768, 1024, 3)

		def test_multi_zip_load(self):
		def test_png_multi_zip_load(self):
		loader = dc.data.ImageLoader()
		dataset = loader.featurize(self.multi_zip_path)
		assert dataset.X.shape == (2, 768, 1024, 3)

		def test_multitype_zip_load(self):
		loader = dc.data.ImageLoader()
		dataset = loader.featurize(self.multitype_zip_path)
		# Since the different files have different shapes, makes an object array
		assert dataset.X.shape == (2,)

		def test_directory_load(self):
		loader = dc.data.ImageLoader()
		dataset = loader.featurize(self.image_dir)
		assert dataset.X.shape == (2, 768, 1024, 3)

deepchem/molnet/init.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -2,6 +2,7 @@ from __future__ import division
		from __future__ import unicode_literals

		from deepchem.molnet.load_function.bace_datasets import load_bace_classification, load_bace_regression
		from deepchem.molnet.load_function.bbbc_datasets import load_bbbc001
		from deepchem.molnet.load_function.bbbp_datasets import load_bbbp
		from deepchem.molnet.load_function.cell_counting_datasets import load_cell_counting
		from deepchem.molnet.load_function.chembl_datasets import load_chembl

deepchem/molnet/load_function/bbbc_datasets.py

0 → 100644

+75 −0

Original line number	Diff line number	Diff line
		"""
		BBBC Dataset loader.

		This file contains image loaders for the BBBC dataset collection (https://data.broadinstitute.org/bbbc/image_sets.html).
		"""

		from __future__ import division
		from __future__ import unicode_literals

		import os
		import numpy as np
		import logging
		import deepchem

		logger = logging.getLogger(__name__)


		def load_bbbc001(split='index', reload=True):
		  """Load BBBC001 dataset

		  This dataset contains 6 images of human HT29 colon cancer cells. The task
		  is to learn to predict the cell counts in these images. This dataset is too
		  small to serve to train algorithms, but might serve as a good test dataset.
		  https://data.broadinstitute.org/bbbc/BBBC001/

		  Parameters
		  ----------
		  split: str or None
		    Name of the splitter to apply: 'index' or 'random'. If None, the dataset
		    is returned unsplit as `(dataset, None, None)`.
		  reload: bool
		    If true, first try to load a previously saved split from the data
		    directory, and save the freshly computed split there otherwise.

		  Returns
		  -------
		  tuple
		    `(tasks, (train, valid, test), transformers)`. No transformers are
		    applied by this loader, so on a fresh featurization `transformers` is an
		    empty list.

		  Raises
		  ------
		  ValueError
		    If `split` is neither None nor one of the supported splitter names.
		  """
		  bbbc001_tasks = ["cell-count"]
		  # Defined up front so that every return path has a value, including
		  # reload=False (previously this raised NameError).
		  transformers = []
		  data_dir = deepchem.utils.get_data_dir()
		  if reload:
		    save_dir = os.path.join(data_dir, "bbbc001/" + str(split))
		    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
		        save_dir)
		    if loaded:
		      return bbbc001_tasks, all_dataset, transformers

		  # Reject an unsupported split before downloading and featurizing anything.
		  splitters = {
		      'index': deepchem.splits.IndexSplitter(),
		      'random': deepchem.splits.RandomSplitter(),
		  }
		  if split is not None and split not in splitters:
		    raise ValueError("Only index and random splits supported.")

		  dataset_file = os.path.join(data_dir, "BBBC001_v1_images_tif.zip")
		  labels_file = os.path.join(data_dir, "BBBC001_v1_counts.txt")

		  if not os.path.exists(dataset_file):
		    deepchem.utils.download_url(
		        'https://data.broadinstitute.org/bbbc/BBBC001/BBBC001_v1_images_tif.zip'
		    )
		  if not os.path.exists(labels_file):
		    deepchem.utils.download_url(
		        'https://data.broadinstitute.org/bbbc/BBBC001/BBBC001_v1_counts.txt')

		  # Featurize Images into NumpyArrays
		  loader = deepchem.data.ImageLoader()
		  dataset = loader.featurize(dataset_file, in_memory=False)

		  # Load text file with labels
		  with open(labels_file) as f:
		    content = f.readlines()
		  # Strip the first line which holds field labels, and drop blank lines
		  # (e.g. a trailing newline) that would otherwise fail to parse.
		  lines = [x.strip() for x in content][1:]
		  # Format is: Image_name count1 count2
		  lines = [x.split("\t") for x in lines if x]
		  # The label is the mean of the two count columns.
		  counts = [(float(x[1]) + float(x[2])) / 2.0 for x in lines]
		  y = np.array(counts)

		  # This is a kludgy way to add y to dataset. Can be done better?
		  # NOTE(review): assumes the rows of the counts file are in the same order
		  # as the images returned by the loader -- confirm.
		  dataset = deepchem.data.DiskDataset.from_numpy(dataset.X, y)

		  if split is None:
		    return bbbc001_tasks, (dataset, None, None), transformers

		  splitter = splitters[split]
		  train, valid, test = splitter.train_valid_test_split(dataset)
		  all_dataset = (train, valid, test)
		  if reload:
		    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
		                                             transformers)
		  return bbbc001_tasks, all_dataset, transformers

Admin message