Commit 9fb18d53 authored by peastman
Browse files

ImageLoader.featurize() returns an ImageDataset

parent 1cb76ba4
Loading
Loading
Loading
Loading
+14 −27
Original line number Diff line number Diff line
@@ -20,8 +20,7 @@ from deepchem.utils.save import load_csv_files
from deepchem.utils.save import load_sdf_files
from deepchem.utils.genomics import encode_fasta_sequence
from deepchem.feat import UserDefinedFeaturizer
from deepchem.data import DiskDataset
from deepchem.data import NumpyDataset
from deepchem.data import DiskDataset, NumpyDataset, ImageDataset
from scipy import misc
import zipfile
from PIL import Image
@@ -230,7 +229,8 @@ class DataLoader(object):
          assert len(X) == len(ids)

        time2 = time.time()
        log("TIMING: featurizing shard %d took %0.3f s" %
        log(
            "TIMING: featurizing shard %d took %0.3f s" %
            (shard_num, time2 - time1), self.verbose)
        yield X, y, w, ids

@@ -294,7 +294,8 @@ class SDFLoader(DataLoader):

  def featurize_shard(self, shard):
    """Featurizes a shard of an input dataframe."""
    log("Currently featurizing feature_type: %s" %
    log(
        "Currently featurizing feature_type: %s" %
        self.featurizer.__class__.__name__, self.verbose)
    return featurize_mol_df(shard, self.featurizer, field=self.mol_field)

@@ -347,12 +348,7 @@ class ImageLoader(DataLoader):
      tasks = []
    self.tasks = tasks

  def featurize(self,
                input_files,
                labels=None,
                weights=None,
                read_img=True,
                in_memory=True):
  def featurize(self, input_files, labels=None, weights=None, in_memory=False):
    """Featurizes image files.

    Parameters
@@ -362,7 +358,7 @@ class ImageLoader(DataLoader):
      (.png, .tif only for now) or of a compressed folder of image files
      (only .zip for now).
    in_memory: bool
      If true, return in-memory NumpyDataset. Else return DiskDataset.
      If true, return in-memory NumpyDataset. Else return ImageDataset.
    """
    if not isinstance(input_files, list):
      input_files = [input_files]
@@ -398,20 +394,11 @@ class ImageLoader(DataLoader):
          raise ValueError("Unsupported file format")
      input_files = remainder

    if read_img:
      X = self.load_img(image_files)
    else:
      X = [None] * len(image_files)
    if in_memory:
      return NumpyDataset(X, y=labels, w=weights, ids=image_files)

      return NumpyDataset(
          self.load_img(image_files), y=labels, w=weights, ids=image_files)
    else:
      # from_numpy currently requires labels. Make dummy labels
      if labels is None:
        labels = np.zeros((len(image_files), 1))
      if weights is None:
        weights = np.zeros((len(image_files), 1))
      return DiskDataset.from_numpy(X, labels, w=weights, ids=image_files)
      return ImageDataset(image_files, y=labels, w=weights, ids=image_files)

  @staticmethod
  def load_img(image_files):
+3 −1
Original line number Diff line number Diff line
@@ -1359,9 +1359,11 @@ class ImageDataset(Dataset):
    ids: ndarray
      the sample IDs
    """
    n_samples = len(X)
    if y is None:
      y = np.zeros((n_samples,))
    self._X_shape = self._find_array_shape(X)
    self._y_shape = self._find_array_shape(y)
    n_samples = len(X)
    if w is None:
      w = np.ones(self._y_shape[:2])
    if ids is None:
+6 −3
Original line number Diff line number Diff line
@@ -23,7 +23,8 @@ class TestImageDataset(test_util.TensorFlowTestCase):
  def test_load_images(self):
    """Test that ImageDataset loads images."""

    files = [os.path.join('images', f) for f in os.listdir('images')]
    path = os.path.join(os.path.dirname(__file__), 'images')
    files = [os.path.join(path, f) for f in os.listdir(path)]

    # First try using images for X.

@@ -55,7 +56,8 @@ class TestImageDataset(test_util.TensorFlowTestCase):
  def test_itersamples(self):
    """Test iterating samples of an ImageDataset."""

    files = [os.path.join('images', f) for f in os.listdir('images')]
    path = os.path.join(os.path.dirname(__file__), 'images')
    files = [os.path.join(path, f) for f in os.listdir(path)]
    ds = dc.data.ImageDataset(files, np.random.random(10))
    X = ds.X
    i = 0
@@ -70,7 +72,8 @@ class TestImageDataset(test_util.TensorFlowTestCase):
  def test_iterbatches(self):
    """Test iterating batches of an ImageDataset."""

    files = [os.path.join('images', f) for f in os.listdir('images')]
    path = os.path.join(os.path.dirname(__file__), 'images')
    files = [os.path.join(path, f) for f in os.listdir(path)]
    ds = dc.data.ImageDataset(files, np.random.random(10))
    X = ds.X
    iterated_ids = set()