Merge pull request #1439 from peastman/imagedataset (92f679e5) · Commits · 钟慕尧 / deepchem

contrib/DiabeticRetinopathy/data.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -52,7 +52,7 @@ def load_images_DR(split='random', seed=None):

		loader = deepchem.data.ImageLoader()
		dat = loader.featurize(
		image_full_paths, labels=labels, weights=weights, read_img=False)
		image_full_paths, labels=labels, weights=weights)
		if split == None:
		return dat

contrib/DiabeticRetinopathy/model.py

+2 −28

Original line number	Diff line number	Diff line
		@@ -142,32 +142,6 @@ class DRModel(TensorGraph):
		# weighted_loss = WeightDecay(0.1, 'l2', in_layers=[weighted_loss])
		self.set_loss(weighted_loss)

		def default_generator(self,
		                      dataset,
		                      epochs=1,
		                      predict=False,
		                      deterministic=True,
		                      pad_batches=True):
		    """Yield one feed dict per minibatch of `dataset`, for `epochs` passes.

		    Parameters
		    ----------
		    dataset: object providing `iterbatches`
		      Source of (X, y, w, ids) minibatches.
		    epochs: int
		      Number of complete passes to make over `dataset`.
		    predict: bool
		      If True, labels and weights are left out of the feed dicts.
		    deterministic: bool
		      Forwarded unchanged to `dataset.iterbatches`.
		    pad_batches: bool
		      Forwarded unchanged to `dataset.iterbatches`.

		    Yields
		    ------
		    dict mapping `self.features[0]` (and, unless predicting,
		    `self.labels[0]` and `self.task_weights[0]`) to the batch arrays.
		    """
		    for _ in range(epochs):
		        batches = dataset.iterbatches(
		            batch_size=self.batch_size,
		            deterministic=deterministic,
		            pad_batches=pad_batches)
		        for X_b, y_b, w_b, ids_b in batches:
		            # A None entry in X_b means the pixels were not stored with the
		            # dataset; the images are then loaded on the fly, using the
		            # batch ids as the file paths.
		            inputs = ImageLoader.load_img(ids_b) if None in X_b else X_b
		            feed_dict = {self.features[0]: inputs}

		            # Labels and weights are only fed when not predicting.
		            if not predict:
		                if y_b is not None:
		                    feed_dict[self.labels[0]] = y_b
		                if w_b is not None:
		                    feed_dict[self.task_weights[0]] = w_b

		            yield feed_dict


		def DRAccuracy(y, y_pred):
		y_pred = np.argmax(y_pred, 1)

deepchem/data/__init__.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -11,6 +11,7 @@ from deepchem.data.datasets import pad_batch
		from deepchem.data.datasets import Dataset
		from deepchem.data.datasets import NumpyDataset
		from deepchem.data.datasets import DiskDataset
		from deepchem.data.datasets import ImageDataset
		from deepchem.data.datasets import sparsify_features
		from deepchem.data.datasets import densify_features
		from deepchem.data.supports import *

deepchem/data/data_loader.py

+14 −27

Original line number	Diff line number	Diff line
		@@ -20,8 +20,7 @@ from deepchem.utils.save import load_csv_files
		from deepchem.utils.save import load_sdf_files
		from deepchem.utils.genomics import encode_fasta_sequence
		from deepchem.feat import UserDefinedFeaturizer
		from deepchem.data import DiskDataset
		from deepchem.data import NumpyDataset
		from deepchem.data import DiskDataset, NumpyDataset, ImageDataset
		from scipy import misc
		import zipfile
		from PIL import Image
		@@ -230,7 +229,8 @@ class DataLoader(object):
		assert len(X) == len(ids)

		time2 = time.time()
		log("TIMING: featurizing shard %d took %0.3f s" %
		log(
		"TIMING: featurizing shard %d took %0.3f s" %
		(shard_num, time2 - time1), self.verbose)
		yield X, y, w, ids

		@@ -294,7 +294,8 @@ class SDFLoader(DataLoader):

		def featurize_shard(self, shard):
		"""Featurizes a shard of an input dataframe."""
		log("Currently featurizing feature_type: %s" %
		log(
		"Currently featurizing feature_type: %s" %
		self.featurizer.__class__.__name__, self.verbose)
		return featurize_mol_df(shard, self.featurizer, field=self.mol_field)

		@@ -347,12 +348,7 @@ class ImageLoader(DataLoader):
		tasks = []
		self.tasks = tasks

		def featurize(self,
		input_files,
		labels=None,
		weights=None,
		read_img=True,
		in_memory=True):
		def featurize(self, input_files, labels=None, weights=None, in_memory=False):
		"""Featurizes image files.

		Parameters
		@@ -362,7 +358,7 @@ class ImageLoader(DataLoader):
		(.png, .tif only for now) or of a compressed folder of image files
		(only .zip for now).
		in_memory: bool
		If true, return in-memory NumpyDataset. Else return DiskDataset.
		If true, return in-memory NumpyDataset. Else return ImageDataset.
		"""
		if not isinstance(input_files, list):
		input_files = [input_files]
		@@ -398,20 +394,11 @@ class ImageLoader(DataLoader):
		raise ValueError("Unsupported file format")
		input_files = remainder

		if read_img:
		X = self.load_img(image_files)
		else:
		X = [None] * len(image_files)
		if in_memory:
		return NumpyDataset(X, y=labels, w=weights, ids=image_files)

		return NumpyDataset(
		self.load_img(image_files), y=labels, w=weights, ids=image_files)
		else:
		# from_numpy currently requires labels. Make dummy labels
		if labels is None:
		labels = np.zeros((len(image_files), 1))
		if weights is None:
		weights = np.zeros((len(image_files), 1))
		return DiskDataset.from_numpy(X, labels, w=weights, ids=image_files)
		return ImageDataset(image_files, y=labels, w=weights, ids=image_files)

		@staticmethod
		def load_img(image_files):

deepchem/data/datasets.py

+203 −0

Original line number	Diff line number	Diff line
		@@ -6,6 +6,7 @@ from __future__ import unicode_literals
		import json
		import os
		import math
		import deepchem as dc
		import numpy as np
		import pandas as pd
		import random
		@@ -1339,6 +1340,208 @@ class DiskDataset(Dataset):
		return self.metadata_df["y_stds"]


		class ImageDataset(Dataset):
		    """A Dataset that loads data from image files on disk.

		    X and y may each be given either as a NumPy array holding the data
		    directly, or as a list of image file paths. Paths are only read (through
		    `dc.data.ImageLoader.load_img`) when the corresponding data is requested.
		    """

		    def __init__(self, X, y, w=None, ids=None):
		        """Create a dataset whose X and/or y array is defined by image files on disk.

		        Parameters
		        ----------
		        X: ndarray or list of strings
		          The dataset's input data. This may be either a single NumPy array
		          directly containing the data, or a list containing the paths to the
		          image files
		        y: ndarray or list of strings
		          The dataset's labels. This may be either a single NumPy array directly
		          containing the data, or a list containing the paths to the image
		          files. If None, a 1D array of zeros (one per sample) is used.
		        w: ndarray
		          a 1D or 2D array containing the weights for each sample or sample/task
		          pair. If None, an array of ones shaped like the first two dimensions
		          of y is used.
		        ids: ndarray
		          the sample IDs. If None, the file paths of X (or else of y) are used
		          when available; otherwise the IDs are 0..n_samples-1.
		        """
		        n_samples = len(X)
		        if y is None:
		            y = np.zeros((n_samples,))
		        self._X_shape = self._find_array_shape(X)
		        self._y_shape = self._find_array_shape(y)
		        if w is None:
		            w = np.ones(self._y_shape[:2])
		        if ids is None:
		            # Prefer file paths as IDs: take them from whichever of X / y was
		            # given as a list of paths, falling back to integer indices.
		            if not isinstance(X, np.ndarray):
		                ids = X
		            elif not isinstance(y, np.ndarray):
		                ids = y
		            else:
		                ids = np.arange(n_samples)
		        self._X = X
		        self._y = y
		        self._w = w
		        self._ids = np.array(ids, dtype=object)

		    def _find_array_shape(self, array):
		        """Return the shape of `array` as a tuple.

		        For an ndarray this is simply its shape. For a list of image paths the
		        first image is loaded to discover the per-image shape, which is then
		        prefixed with the number of paths. An empty list has shape `(0,)`
		        because there is no image to inspect.
		        """
		        if isinstance(array, np.ndarray):
		            return array.shape
		        if len(array) == 0:
		            return (0,)
		        image_shape = dc.data.ImageLoader.load_img([array[0]]).shape[1:]
		        return (len(array),) + tuple(image_shape)

		    def __len__(self):
		        """
		        Get the number of elements in the dataset.
		        """
		        return self._X_shape[0]

		    def get_shape(self):
		        """Get the shape of the dataset.

		        Returns four tuples, giving the shape of the X, y, w, and ids arrays.
		        """
		        return self._X_shape, self._y_shape, self._w.shape, self._ids.shape

		    def get_task_names(self):
		        """Get the names of the tasks associated with this dataset."""
		        if len(self._y_shape) < 2:
		            return np.array([0])
		        return np.arange(self._y_shape[1])

		    @property
		    def X(self):
		        """Get the X vector for this dataset as a single numpy array.

		        If X was given as file paths, every image is loaded on each access.
		        """
		        if isinstance(self._X, np.ndarray):
		            return self._X
		        return dc.data.ImageLoader.load_img(self._X)

		    @property
		    def y(self):
		        """Get the y vector for this dataset as a single numpy array.

		        If y was given as file paths, every image is loaded on each access.
		        """
		        if isinstance(self._y, np.ndarray):
		            return self._y
		        return dc.data.ImageLoader.load_img(self._y)

		    @property
		    def ids(self):
		        """Get the ids vector for this dataset as a single numpy array."""
		        return self._ids

		    @property
		    def w(self):
		        """Get the weight vector for this dataset as a single numpy array."""
		        return self._w

		    def iterbatches(self,
		                    batch_size=None,
		                    epoch=0,
		                    deterministic=False,
		                    pad_batches=False):
		        """Get an object that iterates over minibatches from the dataset.

		        Each minibatch is returned as a tuple of four numpy arrays:
		        (X, y, w, ids).

		        Parameters
		        ----------
		        batch_size: int
		          Number of samples per batch. If None, the whole dataset is returned
		          as a single batch.
		        epoch: int
		          Not used by this implementation.
		        deterministic: bool
		          If True, samples are visited in order; otherwise in a random
		          permutation.
		        pad_batches: bool
		          If True, each batch is passed through `pad_batch` so that it
		          contains `batch_size` samples.
		        """

		        def iterate(dataset, batch_size, deterministic, pad_batches):
		            n_samples = dataset._X_shape[0]
		            if n_samples == 0:
		                # Nothing to yield; also avoids dividing by a zero batch size
		                # when batch_size defaults to the dataset size.
		                return
		            if deterministic:
		                sample_perm = np.arange(n_samples)
		            else:
		                sample_perm = np.random.permutation(n_samples)
		            if batch_size is None:
		                batch_size = n_samples
		            # Integer ceiling division: independent of np.math (removed from
		            # recent NumPy) and of true-vs-floor division semantics.
		            num_batches = (n_samples + batch_size - 1) // batch_size
		            for batch_idx in range(num_batches):
		                start = batch_idx * batch_size
		                end = min(n_samples, start + batch_size)
		                perm_indices = sample_perm[start:end]
		                if isinstance(dataset._X, np.ndarray):
		                    X_batch = dataset._X[perm_indices]
		                else:
		                    # Only the images belonging to this batch are read.
		                    X_batch = dc.data.ImageLoader.load_img(
		                        [dataset._X[i] for i in perm_indices])
		                if isinstance(dataset._y, np.ndarray):
		                    y_batch = dataset._y[perm_indices]
		                else:
		                    y_batch = dc.data.ImageLoader.load_img(
		                        [dataset._y[i] for i in perm_indices])
		                w_batch = dataset._w[perm_indices]
		                ids_batch = dataset._ids[perm_indices]
		                if pad_batches:
		                    (X_batch, y_batch, w_batch, ids_batch) = pad_batch(
		                        batch_size, X_batch, y_batch, w_batch, ids_batch)
		                yield (X_batch, y_batch, w_batch, ids_batch)

		        return iterate(self, batch_size, deterministic, pad_batches)

		    def itersamples(self):
		        """Get an object that iterates over the samples in the dataset.

		        Each sample is returned as a tuple (x, y, w, id). When X or y is backed
		        by image files, the corresponding image is loaded one sample at a
		        time as the iterator advances.
		        """

		        def get_image(array, index):
		            if isinstance(array, np.ndarray):
		                return array[index]
		            return dc.data.ImageLoader.load_img([array[index]])[0]

		        n_samples = self._X_shape[0]
		        return ((get_image(self._X, i), get_image(self._y, i), self._w[i],
		                 self._ids[i]) for i in range(n_samples))

		    def transform(self, fn, **args):
		        """Construct a new dataset by applying a transformation to every sample in this dataset.

		        The argument is a function that can be called as follows:

		        >> newx, newy, neww = fn(x, y, w)

		        It might be called only once with the whole dataset, or multiple times
		        with different subsets of the data. Each time it is called, it should
		        transform the samples and return the transformed data.

		        This implementation calls `fn` once with the full X, y and w arrays
		        (loading every image into memory) and returns the result as a
		        NumpyDataset.

		        Parameters
		        ----------
		        fn: function
		          A function to apply to each sample in the dataset

		        Returns
		        -------
		        a newly constructed Dataset object
		        """
		        newx, newy, neww = fn(self.X, self.y, self.w)
		        return NumpyDataset(newx, newy, neww, self.ids[:])

		    def select(self, indices, select_dir=None):
		        """Creates a new dataset from a selection of indices from self.

		        TODO(rbharath): select_dir is here due to dc.splits always passing in
		        splits.

		        Parameters
		        ----------
		        indices: list
		          List of indices to select.
		        select_dir: string
		          Ignored.
		        """
		        if isinstance(self._X, np.ndarray):
		            X = self._X[indices]
		        else:
		            # Path lists are subset without reading any image.
		            X = [self._X[i] for i in indices]
		        if isinstance(self._y, np.ndarray):
		            y = self._y[indices]
		        else:
		            y = [self._y[i] for i in indices]
		        w = self._w[indices]
		        ids = self._ids[indices]
		        return ImageDataset(X, y, w, ids)


		class Databag(object):
		"""
		A utility class to iterate through multiple datasets together.

Admin message