Unverified Commit 92f679e5 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1439 from peastman/imagedataset

Created ImageDataset
parents 787bffa4 ffeb4b7c
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -52,7 +52,7 @@ def load_images_DR(split='random', seed=None):

  loader = deepchem.data.ImageLoader()
  dat = loader.featurize(
      image_full_paths, labels=labels, weights=weights, read_img=False)
      image_full_paths, labels=labels, weights=weights)
  if split == None:
    return dat

+2 −28
Original line number Diff line number Diff line
@@ -142,32 +142,6 @@ class DRModel(TensorGraph):
    # weighted_loss = WeightDecay(0.1, 'l2', in_layers=[weighted_loss])
    self.set_loss(weighted_loss)

  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        deterministic=True,
                        pad_batches=True):
    """Yield one feed dict per minibatch of `dataset`, for `epochs` passes.

    Parameters
    ----------
    dataset: object providing iterbatches()
      Source of (X, y, w, ids) minibatches.
    epochs: int
      Number of complete passes to make over the dataset.
    predict: bool
      If True, labels and weights are left out of the feed dict.
    deterministic: bool
      Forwarded unchanged to dataset.iterbatches().
    pad_batches: bool
      Forwarded unchanged to dataset.iterbatches().

    Yields
    ------
    dict keyed by self.features[0] and, unless predicting, by
    self.labels[0] and self.task_weights[0].
    """
    include_targets = not predict
    for _ in range(epochs):
      batches = dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches)
      for (X_b, y_b, w_b, ids_b) in batches:
        # A None entry in X_b means the images were not read up front, so
        # they are loaded on the fly using the ids as file paths.
        inputs = ImageLoader.load_img(ids_b) if None in X_b else X_b
        feed_dict = {self.features[0]: inputs}

        if include_targets and y_b is not None:
          feed_dict[self.labels[0]] = y_b
        if include_targets and w_b is not None:
          feed_dict[self.task_weights[0]] = w_b

        yield feed_dict


def DRAccuracy(y, y_pred):
  y_pred = np.argmax(y_pred, 1)
+1 −0
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@ from deepchem.data.datasets import pad_batch
from deepchem.data.datasets import Dataset
from deepchem.data.datasets import NumpyDataset
from deepchem.data.datasets import DiskDataset
from deepchem.data.datasets import ImageDataset
from deepchem.data.datasets import sparsify_features
from deepchem.data.datasets import densify_features
from deepchem.data.supports import *
+14 −27
Original line number Diff line number Diff line
@@ -20,8 +20,7 @@ from deepchem.utils.save import load_csv_files
from deepchem.utils.save import load_sdf_files
from deepchem.utils.genomics import encode_fasta_sequence
from deepchem.feat import UserDefinedFeaturizer
from deepchem.data import DiskDataset
from deepchem.data import NumpyDataset
from deepchem.data import DiskDataset, NumpyDataset, ImageDataset
from scipy import misc
import zipfile
from PIL import Image
@@ -230,7 +229,8 @@ class DataLoader(object):
          assert len(X) == len(ids)

        time2 = time.time()
        log("TIMING: featurizing shard %d took %0.3f s" %
        log(
            "TIMING: featurizing shard %d took %0.3f s" %
            (shard_num, time2 - time1), self.verbose)
        yield X, y, w, ids

@@ -294,7 +294,8 @@ class SDFLoader(DataLoader):

  def featurize_shard(self, shard):
    """Featurizes a shard of an input dataframe."""
    log("Currently featurizing feature_type: %s" %
    log(
        "Currently featurizing feature_type: %s" %
        self.featurizer.__class__.__name__, self.verbose)
    return featurize_mol_df(shard, self.featurizer, field=self.mol_field)

@@ -347,12 +348,7 @@ class ImageLoader(DataLoader):
      tasks = []
    self.tasks = tasks

  def featurize(self,
                input_files,
                labels=None,
                weights=None,
                read_img=True,
                in_memory=True):
  def featurize(self, input_files, labels=None, weights=None, in_memory=False):
    """Featurizes image files.

    Parameters
@@ -362,7 +358,7 @@ class ImageLoader(DataLoader):
      (.png, .tif only for now) or of a compressed folder of image files
      (only .zip for now).
    in_memory: bool
      If true, return in-memory NumpyDataset. Else return DiskDataset.
      If true, return in-memory NumpyDataset. Else return ImageDataset.
    """
    if not isinstance(input_files, list):
      input_files = [input_files]
@@ -398,20 +394,11 @@ class ImageLoader(DataLoader):
          raise ValueError("Unsupported file format")
      input_files = remainder

    if read_img:
      X = self.load_img(image_files)
    else:
      X = [None] * len(image_files)
    if in_memory:
      return NumpyDataset(X, y=labels, w=weights, ids=image_files)

      return NumpyDataset(
          self.load_img(image_files), y=labels, w=weights, ids=image_files)
    else:
      # from_numpy currently requires labels. Make dummy labels
      if labels is None:
        labels = np.zeros((len(image_files), 1))
      if weights is None:
        weights = np.zeros((len(image_files), 1))
      return DiskDataset.from_numpy(X, labels, w=weights, ids=image_files)
      return ImageDataset(image_files, y=labels, w=weights, ids=image_files)

  @staticmethod
  def load_img(image_files):
+203 −0
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@ from __future__ import unicode_literals
import json
import os
import math
import deepchem as dc
import numpy as np
import pandas as pd
import random
@@ -1339,6 +1340,208 @@ class DiskDataset(Dataset):
    return self.metadata_df["y_stds"]


class ImageDataset(Dataset):
  """A Dataset that loads data from image files on disk.

  X and y may each be either an in-memory NumPy array or a list of image file
  paths.  Path lists are kept as-is and the images are only read (through
  ImageLoader.load_img) when the data is actually requested, so a whole
  dataset of images never has to be held in memory at once unless the `X` or
  `y` property is accessed.
  """

  def __init__(self, X, y=None, w=None, ids=None):
    """Create a dataset whose X and/or y array is defined by image files on disk.

    Parameters
    ----------
    X: ndarray or list of strings
      The dataset's input data.  This may be either a single NumPy array directly
      containing the data, or a list containing the paths to the image files
    y: ndarray or list of strings, optional
      The dataset's labels.  This may be either a single NumPy array directly
      containing the data, or a list containing the paths to the image files.
      If None, a 1D array of zeros (one per sample) is used.
    w: ndarray, optional
      a 1D or 2D array containing the weights for each sample or sample/task
      pair.  If None, an array of ones is used.
    ids: ndarray, optional
      the sample IDs.  If None, the file paths of X (or else of y) are used
      when available, otherwise the sample indices.
    """
    n_samples = len(X)
    if y is None:
      y = np.zeros((n_samples,))
    self._X_shape = self._find_array_shape(X)
    self._y_shape = self._find_array_shape(y)
    if w is None:
      # One weight per sample, or per sample/task pair when y is 2D or more.
      w = np.ones(self._y_shape[:2])
    if ids is None:
      if not isinstance(X, np.ndarray):
        ids = X
      elif not isinstance(y, np.ndarray):
        ids = y
      else:
        ids = np.arange(n_samples)
    self._X = X
    self._y = y
    self._w = w
    self._ids = np.array(ids, dtype=object)

  def _find_array_shape(self, array):
    """Return the shape of `array` as a tuple.

    For a list of file paths only the first image is loaded; every image in
    the list is assumed to have the same shape as that one.
    """
    if isinstance(array, np.ndarray):
      return array.shape
    if len(array) == 0:
      # There is no image to probe (e.g. after selecting zero samples), so
      # only the sample dimension is known.
      return (0,)
    image_shape = dc.data.ImageLoader.load_img([array[0]]).shape[1:]
    # Build a plain tuple so this matches what ndarray.shape returns.
    return (len(array),) + tuple(image_shape)

  def __len__(self):
    """
    Get the number of elements in the dataset.
    """
    return self._X_shape[0]

  def get_shape(self):
    """Get the shape of the dataset.

    Returns four tuples, giving the shape of the X, y, w, and ids arrays.
    """
    return self._X_shape, self._y_shape, self._w.shape, self._ids.shape

  def get_task_names(self):
    """Get the names of the tasks associated with this dataset."""
    if len(self._y_shape) < 2:
      return np.array([0])
    return np.arange(self._y_shape[1])

  @property
  def X(self):
    """Get the X vector for this dataset as a single numpy array.

    If X was given as file paths, this loads every image.
    """
    if isinstance(self._X, np.ndarray):
      return self._X
    return dc.data.ImageLoader.load_img(self._X)

  @property
  def y(self):
    """Get the y vector for this dataset as a single numpy array.

    If y was given as file paths, this loads every image.
    """
    if isinstance(self._y, np.ndarray):
      return self._y
    return dc.data.ImageLoader.load_img(self._y)

  @property
  def ids(self):
    """Get the ids vector for this dataset as a single numpy array."""
    return self._ids

  @property
  def w(self):
    """Get the weight vector for this dataset as a single numpy array."""
    return self._w

  def iterbatches(self,
                  batch_size=None,
                  epoch=0,
                  deterministic=False,
                  pad_batches=False):
    """Get an object that iterates over minibatches from the dataset.

    Each minibatch is returned as a tuple of four numpy arrays: (X, y, w, ids).
    Images stored as file paths are loaded one batch at a time.

    Parameters
    ----------
    batch_size: int, optional
      Number of samples per batch.  If None, the whole dataset is one batch.
    epoch: int
      Not used by this implementation.
    deterministic: bool
      If True, samples are visited in order; otherwise in a random order.
    pad_batches: bool
      If True, every batch is passed through pad_batch() with `batch_size`.
    """

    def fetch(array, sample_indices):
      # In-memory arrays are indexed directly; path lists are read from disk.
      if isinstance(array, np.ndarray):
        return array[sample_indices]
      return dc.data.ImageLoader.load_img([array[i] for i in sample_indices])

    def iterate(dataset, batch_size, deterministic, pad_batches):
      n_samples = dataset._X_shape[0]
      if n_samples == 0:
        # Nothing to yield; also avoids dividing by a zero batch size below.
        return
      if not deterministic:
        sample_perm = np.random.permutation(n_samples)
      else:
        sample_perm = np.arange(n_samples)
      if batch_size is None:
        batch_size = n_samples
      # Integer ceiling division; avoids the np.math alias, which is no
      # longer available in current NumPy releases.
      num_batches = (n_samples + batch_size - 1) // batch_size
      for batch_idx in range(num_batches):
        start = batch_idx * batch_size
        end = min(n_samples, start + batch_size)
        perm_indices = sample_perm[start:end]
        X_batch = fetch(dataset._X, perm_indices)
        y_batch = fetch(dataset._y, perm_indices)
        w_batch = dataset._w[perm_indices]
        ids_batch = dataset._ids[perm_indices]
        if pad_batches:
          (X_batch, y_batch, w_batch, ids_batch) = pad_batch(
              batch_size, X_batch, y_batch, w_batch, ids_batch)
        yield (X_batch, y_batch, w_batch, ids_batch)

    return iterate(self, batch_size, deterministic, pad_batches)

  def itersamples(self):
    """Get an object that iterates over the samples in the dataset.

    Each sample is returned as a tuple (x, y, w, id).  When X or y is stored
    as a list of file paths, the corresponding image is loaded from disk one
    sample at a time as the iterator advances.
    """

    def get_image(array, index):
      if isinstance(array, np.ndarray):
        return array[index]
      return dc.data.ImageLoader.load_img([array[index]])[0]

    n_samples = self._X_shape[0]
    return ((get_image(self._X, i), get_image(self._y, i), self._w[i],
             self._ids[i]) for i in range(n_samples))

  def transform(self, fn, **args):
    """Construct a new dataset by applying a transformation to every sample in this dataset.

    The argument is a function that can be called as follows:

    >> newx, newy, neww = fn(x, y, w)

    It is called once with the whole dataset, which means every image stored
    as a file path is loaded into memory first.

    Parameters
    ----------
    fn: function
      A function to apply to each sample in the dataset

    Returns
    -------
    a newly constructed NumpyDataset holding the transformed data
    """
    newx, newy, neww = fn(self.X, self.y, self.w)
    return NumpyDataset(newx, newy, neww, self.ids[:])

  def select(self, indices, select_dir=None):
    """Creates a new dataset from a selection of indices from self.

    TODO(rbharath): select_dir is here due to dc.splits always passing in
    splits.

    Parameters
    ----------
    indices: list
      List of indices to select.
    select_dir: string
      Ignored.
    """
    if isinstance(self._X, np.ndarray):
      X = self._X[indices]
    else:
      X = [self._X[i] for i in indices]
    if isinstance(self._y, np.ndarray):
      y = self._y[indices]
    else:
      y = [self._y[i] for i in indices]
    w = self._w[indices]
    ids = self._ids[indices]
    return ImageDataset(X, y, w, ids)


class Databag(object):
  """
  A utility class to iterate through multiple datasets together.
Loading