Commit 5d64e281 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

changes

parent 2b76c72b
Loading
Loading
Loading
Loading
+64 −28
Original line number Diff line number Diff line
@@ -19,7 +19,7 @@ from deepchem.utils.save import load_csv_files, load_json_files
from deepchem.utils.save import load_sdf_files
from deepchem.utils.genomics import encode_fasta_sequence
from deepchem.feat import UserDefinedFeaturizer, Featurizer
from deepchem.data import DiskDataset, NumpyDataset, ImageDataset
from deepchem.data import Dataset, DiskDataset, NumpyDataset, ImageDataset
import zipfile

logger = logging.getLogger(__name__)
@@ -278,7 +278,7 @@ class DataLoader(object):
  def create_dataset(self,
                     inputs: Sequence[Any],
                     data_dir: Optional[str] = None,
                     shard_size: int = 8192) -> DiskDataset:
                     shard_size: Optional[int] = 8192) -> Dataset:
    """Creates and returns a `Dataset` object by featurizing provided files.

    Reads in `inputs` and uses `self.featurizer` to featurize the
@@ -306,7 +306,7 @@ class DataLoader(object):
    from `inputs`.
    """
    logger.info("Loading raw samples now.")
    logger.info("shard_size: %d" % shard_size)
    logger.info("shard_size: %s" % str(shard_size))

    if not isinstance(inputs, list):
      inputs = [inputs]
@@ -527,7 +527,7 @@ class JsonLoader(DataLoader):
  def create_dataset(self,
                     input_files: OneOrMany[str],
                     data_dir: Optional[str] = None,
                     shard_size: int = 8192) -> DiskDataset:
                     shard_size: Optional[int] = 8192) -> DiskDataset:
    """Creates a `Dataset` from input JSON files.

    Parameters
@@ -731,7 +731,7 @@ class FASTALoader(DataLoader):
    A `Dataset` object containing a featurized representation of data
    from `input_files`.
    """
    if not isinstance(input_files, list):
    if isinstance(input_files, str):
      input_files = [input_files]

    def shard_generator():
@@ -768,25 +768,31 @@ class ImageLoader(DataLoader):
      tasks = []
    self.tasks = tasks

  def create_dataset(
      self,
      input_files: OneOrMany[str],
      labels: Optional[np.ndarray],
      weights: Optional[np.ndarray],
  def create_dataset(self,
                     inputs: Union[OneOrMany[str], Tuple[Any]],
                     data_dir: Optional[str] = None,
      in_memory: bool = False) -> Union[NumpyDataset, ImageDataset]:
                     shard_size: Optional[int] = 8192,
                     in_memory: bool = False) -> Dataset:
    """Creates and returns a `Dataset` object by featurizing provided image files and labels/weights.

    Parameters
    ----------
    input_files: list
      Each file in this list should either be of a supported
      image format (.png, .tif only for now) or of a compressed
      folder of image files (only .zip for now).
    labels: optional
      If provided, a numpy ndarray of image labels
    weights: optional
      If provided, a numpy ndarray of image weights
    inputs: `Union[OneOrMany[str], Tuple[Any]]`
      The inputs provided should be one of the following

      - filename
      - list of filenames
      - Tuple (list of filenames, labels)
      - Tuple (list of filenames, labels, weights)

      Each file in a given list of filenames should either be of a supported
      image format (.png, .tif only for now) or of a compressed folder of
      image files (only .zip for now). If `labels` or `weights` are provided,
      they must correspond to the sorted order of all filenames provided, with
      one label/weight per file.

    data_dir: str, optional
      Directory to store featurized dataset.
    in_memory: bool
      If true, return in-memory NumpyDataset. Else return ImageDataset.

@@ -794,8 +800,23 @@ class ImageLoader(DataLoader):
    -------
    A `Dataset` object containing a featurized representation of data
    from `input_files`, `labels`, and `weights`.

    """
    if not isinstance(input_files, list):
    labels, weights = None, None
    if isinstance(inputs, tuple):
      if len(inputs) == 1:
        input_files = inputs[0]
        if isinstance(inputs, str):
          input_files = [inputs]
      elif len(inputs) == 2:
        input_files, labels = inputs
      elif len(inputs) == 3:
        input_files, labels, weights = inputs
      else:
        raise ValueError("Input must be a tuple of length 1, 2, or 3")
    else:
      input_files = inputs
    if isinstance(input_files, str):
      input_files = [input_files]

    image_files = []
@@ -831,14 +852,29 @@ class ImageLoader(DataLoader):
          raise ValueError("Unsupported file format")
      input_files = remainder

    # Sort image files
    image_files = sorted(image_files)

    if in_memory:
      if data_dir is None:
        return NumpyDataset(
            self.load_img(image_files), y=labels, w=weights, ids=image_files)
      else:
        dataset = DiskDataset.from_numpy(
            self.load_img(image_files),
            y=labels,
            w=weights,
            ids=image_files,
            tasks=self.tasks,
            data_dir=data_dir)
        if shard_size is not None:
          dataset.reshard(shard_size)
        return dataset
    else:
      return ImageDataset(image_files, y=labels, w=weights, ids=image_files)

  @staticmethod
  def load_img(image_files):
  def load_img(image_files) -> np.ndarray:
    """Loads a set of images from disk.

    Parameters
@@ -848,7 +884,7 @@ class ImageLoader(DataLoader):

    Returns
    -------
    np.ndarray of that contains loaded images. Of shape `(N,...)`.
    np.ndarray that contains loaded images. Of shape `(N,...)`.

    Note
    ----
@@ -924,7 +960,7 @@ class InMemoryLoader(DataLoader):
  def create_dataset(self,
                     inputs: Sequence[Any],
                     data_dir: Optional[str] = None,
                     shard_size: int = 8192) -> DiskDataset:
                     shard_size: Optional[int] = 8192) -> DiskDataset:
    """Creates and returns a `Dataset` object by featurizing provided files.

    Reads in `inputs` and uses `self.featurizer` to featurize the
@@ -939,7 +975,7 @@ class InMemoryLoader(DataLoader):

    Parameters
    ----------
    inputs: list
    inputs: Sequence[Any]
      List of inputs to process. Entries can be filenames or arbitrary objects.
    data_dir: str, optional
      Directory to store featurized dataset.
@@ -952,7 +988,7 @@ class InMemoryLoader(DataLoader):
    from `inputs`.
    """
    logger.info("Loading raw samples now.")
    logger.info("shard_size: %d" % shard_size)
    logger.info("shard_size: %s" % str(shard_size))

    if not isinstance(inputs, list):
      try:
+0 −3
Original line number Diff line number Diff line
@@ -1090,9 +1090,6 @@ class DiskDataset(Dataset):
    Gets learning tasks associated with this dataset.
    """
    return self.tasks
    # if not len(self.metadata_df):
    #  raise ValueError("No data in dataset.")
    # return next(self.metadata_df.iterrows())[1]['task_names']

  def reshard(self, shard_size: int) -> None:
    """Reshards data to have specified shard size."""
+8 −0
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ import tempfile
from scipy import misc
import deepchem as dc
import zipfile
import numpy as np


class TestImageLoader(unittest.TestCase):
@@ -62,6 +63,13 @@ class TestImageLoader(unittest.TestCase):
    # These are the known dimensions of face.png
    assert dataset.X.shape == (1, 768, 1024, 3)

  def test_png_simple_load_with_labels(self):
    loader = dc.data.ImageLoader()
    dataset = loader.featurize((self.face_path, np.array(1)))
    # These are the known dimensions of face.png
    assert dataset.X.shape == (1, 768, 1024, 3)
    assert (dataset.y == np.ones((1,))).all()

  def test_tif_simple_load(self):
    loader = dc.data.ImageLoader()
    dataset = loader.featurize(self.tif_image_path)