Unverified Commit a131d400 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1327 from rbharath/bbbc

Add BBBC001 Dataset
parents fd7c0266 d8d755fa
Loading
Loading
Loading
Loading
+58 −22
Original line number Diff line number Diff line
@@ -24,6 +24,7 @@ from deepchem.data import DiskDataset
from deepchem.data import NumpyDataset
from scipy import misc
import zipfile
from PIL import Image


def convert_df_to_numpy(df, tasks, verbose=False):
@@ -335,6 +336,11 @@ class FASTALoader(DataLoader):
class ImageLoader(DataLoader):
  """
  Handles loading of image files.

  This class allows for loading of images in various formats. For user
  convenience, also accepts zip-files and directories of images and uses some
  limited intelligence to attempt to traverse subdirectories which contain
  images.
  """

  def __init__(self, tasks=None):
@@ -343,38 +349,68 @@ class ImageLoader(DataLoader):
      tasks = []
    self.tasks = tasks

  def featurize(self, input_files):
  def featurize(self, input_files, in_memory=True):
    """Featurizes image files.

    Parameters
    ----------
    input_files: list
      Each file in this list should either be of a supported image format (.png
      only for now) or of a compressed folder of image files (only .zip for now).
      Each file in this list should either be of a supported image format
      (.png, .tif only for now) or of a compressed folder of image files
      (only .zip for now).
    in_memory: bool
      If true, return in-memory NumpyDataset. Else return DiskDataset.
    """
    if not isinstance(input_files, list):
      input_files = [input_files]

    images = []
    image_files = []
    # Sometimes zip files contain directories within. Traverse directories
    while len(input_files) > 0:
      remainder = []
      for input_file in input_files:
        filename, extension = os.path.splitext(input_file)
        # TODO(rbharath): Add support for more extensions
      if extension == ".zip":
        if os.path.isdir(input_file):
          dirfiles = [
              os.path.join(input_file, subfile)
              for subfile in os.listdir(input_file)
          ]
          remainder += dirfiles
        elif extension == ".zip":
          zip_dir = tempfile.mkdtemp()
          zip_ref = zipfile.ZipFile(input_file, 'r')
          zip_ref.extractall(path=zip_dir)
          zip_ref.close()
        image_files += [
          zip_files = [
              os.path.join(zip_dir, name) for name in zip_ref.namelist()
          ]
      elif extension == ".png":
          for zip_file in zip_files:
            _, extension = os.path.splitext(zip_file)
            if extension in [".png", ".tif"]:
              image_files.append(zip_file)
        elif extension in [".png", ".tif"]:
          image_files.append(input_file)
        else:
          raise ValueError("Unsupported file format")
      input_files = remainder

    images = []
    for image_file in image_files:
      _, extension = os.path.splitext(image_file)
      if extension == ".png":
        image = misc.imread(image_file)
        images.append(image)
      elif extension == ".tif":
        im = Image.open(image_file)
        imarray = np.array(im)
        images.append(imarray)
      else:
        raise ValueError("Unsupported image filetype for %s" % image_file)
    images = np.array(images)
    if in_memory:
      return NumpyDataset(images)
    else:
      # from_numpy currently requires labels. Make dummy labels
      labels = np.zeros(len(images))
      return DiskDataset.from_numpy(images, labels)
+15.9 KiB

File added.

No diff preview for this file type.

+38 −5
Original line number Diff line number Diff line
@@ -19,13 +19,15 @@ class TestImageLoader(unittest.TestCase):

  def setUp(self):
    super(TestImageLoader, self).setUp()
    self.current_dir = os.path.dirname(os.path.abspath(__file__))
    self.tif_image_path = os.path.join(self.current_dir, "a_image.tif")

    # Create image file
    self.data_dir = tempfile.mkdtemp()
    self.face = misc.face()
    self.face_path = os.path.join(self.data_dir, "face.png")
    misc.imsave(self.face_path, self.face)
    self.face_copy_path = os.path.join(self.data_dir, "face.png")
    self.face_copy_path = os.path.join(self.data_dir, "face_copy.png")
    misc.imsave(self.face_copy_path, self.face)

    # Create zip of image file
@@ -42,23 +44,54 @@ class TestImageLoader(unittest.TestCase):
    zipf.write(self.face_copy_path)
    zipf.close()

  def test_simple_load(self):
    # Create zip of multiple image files, multiple_types
    self.multitype_zip_path = os.path.join(self.data_dir, "multitype_face.zip")
    zipf = zipfile.ZipFile(self.multitype_zip_path, "w", zipfile.ZIP_DEFLATED)
    zipf.write(self.face_path)
    zipf.write(self.tif_image_path)
    zipf.close()

    # Create image directory
    self.image_dir = tempfile.mkdtemp()
    face_path = os.path.join(self.image_dir, "face.png")
    misc.imsave(face_path, self.face)
    face_copy_path = os.path.join(self.image_dir, "face_copy.png")
    misc.imsave(face_copy_path, self.face)

  def test_png_simple_load(self):
    loader = dc.data.ImageLoader()
    dataset = loader.featurize(self.face_path)
    # These are the known dimensions of face.png
    assert dataset.X.shape == (1, 768, 1024, 3)

  def test_multi_load(self):
  def test_tif_simple_load(self):
    loader = dc.data.ImageLoader()
    dataset = loader.featurize(self.tif_image_path)
    # TODO(rbharath): Where are the color channels?
    assert dataset.X.shape == (1, 44, 330)

  def test_png_multi_load(self):
    loader = dc.data.ImageLoader()
    dataset = loader.featurize([self.face_path, self.face_copy_path])
    assert dataset.X.shape == (2, 768, 1024, 3)

  def test_zip_load(self):
  def test_png_zip_load(self):
    loader = dc.data.ImageLoader()
    dataset = loader.featurize(self.zip_path)
    assert dataset.X.shape == (1, 768, 1024, 3)

  def test_multi_zip_load(self):
  def test_png_multi_zip_load(self):
    loader = dc.data.ImageLoader()
    dataset = loader.featurize(self.multi_zip_path)
    assert dataset.X.shape == (2, 768, 1024, 3)

  def test_multitype_zip_load(self):
    loader = dc.data.ImageLoader()
    dataset = loader.featurize(self.multitype_zip_path)
    # Since the different files have different shapes, makes an object array
    assert dataset.X.shape == (2,)

  def test_directory_load(self):
    loader = dc.data.ImageLoader()
    dataset = loader.featurize(self.image_dir)
    assert dataset.X.shape == (2, 768, 1024, 3)
+1 −0
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@ from __future__ import division
from __future__ import unicode_literals

from deepchem.molnet.load_function.bace_datasets import load_bace_classification, load_bace_regression
from deepchem.molnet.load_function.bbbc_datasets import load_bbbc001
from deepchem.molnet.load_function.bbbp_datasets import load_bbbp
from deepchem.molnet.load_function.cell_counting_datasets import load_cell_counting
from deepchem.molnet.load_function.chembl_datasets import load_chembl
+75 −0
Original line number Diff line number Diff line
"""
BBBC Dataset loader.

This file contains image loaders for the BBBC dataset collection (https://data.broadinstitute.org/bbbc/image_sets.html).
"""

from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import logging
import deepchem

logger = logging.getLogger(__name__)


def load_bbbc001(split='index', reload=True):
  """Load BBBC001 dataset

  This dataset contains 6 images of human HT29 colon cancer cells. The task is
  to learn to predict the cell counts in these images. This dataset is too
  small to serve to train algorithms, but might serve as a good test dataset.
  https://data.broadinstitute.org/bbbc/BBBC001/

  Parameters
  ----------
  split: str or None
    Name of the splitter to apply: 'index' or 'random'. If None, the dataset
    is returned unsplit.
  reload: bool
    If True, first try to load a previously saved split from
    `<data_dir>/bbbc001/<split>`; when nothing is loaded, the freshly computed
    train/valid/test split is saved there before returning.

  Returns
  -------
  tasks: list
    The task names, `["cell-count"]`.
  datasets: tuple
    `(train, valid, test)` datasets, or `(dataset, None, None)` when `split`
    is None.
  transformers: list
    Transformers associated with the dataset. This loader applies none, so
    the list is empty unless it was reloaded from disk.

  Raises
  ------
  ValueError
    If `split` is neither None nor one of the supported splitter names.
  """
  bbbc001_tasks = ["cell-count"]
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    save_dir = os.path.join(data_dir, "bbbc001/" + str(split))
    loaded, all_dataset, transformers = (
        deepchem.utils.save.load_dataset_from_disk(save_dir))
    if loaded:
      return bbbc001_tasks, all_dataset, transformers

  # Validate the requested split before downloading/featurizing so that a bad
  # argument fails fast instead of after the expensive work.
  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
  }
  if split is not None and split not in splitters:
    raise ValueError("Only index and random splits supported.")

  # No transformers are applied by this loader. Define the list explicitly so
  # it exists on every code path (previously unbound when reload=False).
  transformers = []

  dataset_file = os.path.join(data_dir, "BBBC001_v1_images_tif.zip")
  labels_file = os.path.join(data_dir, "BBBC001_v1_counts.txt")

  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'https://data.broadinstitute.org/bbbc/BBBC001/BBBC001_v1_images_tif.zip'
    )
  if not os.path.exists(labels_file):
    deepchem.utils.download_url(
        'https://data.broadinstitute.org/bbbc/BBBC001/BBBC001_v1_counts.txt')

  # Featurize images into numpy arrays
  loader = deepchem.data.ImageLoader()
  dataset = loader.featurize(dataset_file, in_memory=False)

  # Load text file with labels
  with open(labels_file) as f:
    content = f.readlines()
  # Drop the first line, which holds field labels, and skip blank lines (e.g.
  # a trailing newline) that would otherwise fail the column lookup below.
  rows = [line.strip() for line in content][1:]
  # Format is: Image_name count1 count2 (tab separated)
  rows = [row.split("\t") for row in rows if row]
  # The label for each image is the mean of the two recorded counts.
  counts = [(float(row[1]) + float(row[2])) / 2.0 for row in rows]
  y = np.array(counts)

  # NOTE(review): this pairs images with counts purely by position; it assumes
  # the loader yields images in the same order as the rows of the counts file
  # -- TODO confirm.
  # This is a kludgy way to add y to the dataset. Can be done better?
  dataset = deepchem.data.DiskDataset.from_numpy(dataset.X, y)

  if split is None:
    return bbbc001_tasks, (dataset, None, None), transformers

  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  all_dataset = (train, valid, test)
  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
  return bbbc001_tasks, all_dataset, transformers
Loading