Commit cf989416 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Changes

parent 88d8bd25
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -18,3 +18,4 @@ from deepchem.data.data_loader import JsonLoader
from deepchem.data.data_loader import SDFLoader
from deepchem.data.data_loader import FASTALoader
from deepchem.data.data_loader import ImageLoader
from deepchem.data.data_loader import InMemoryLoader
+224 −19
Original line number Diff line number Diff line
@@ -85,6 +85,10 @@ def _featurize_smiles_df(df, featurizer, field, log_every_n=1000):
    The name of a column in `df` that holds SMILES strings
  log_every_n: int, optional (default 1000)
    Emit a logging statement every `log_every_n` rows.

  Note
  ----
  This function requires RDKit to be installed
  """
  sample_elems = df[field].tolist()

@@ -238,7 +242,7 @@ class DataLoader(object):
    self.featurizer = featurizer
    self.log_every_n = log_every_n

  def featurize(self, input_files, data_dir=None, shard_size=8192):
  def featurize(self, inputs, data_dir=None, shard_size=8192):
    """Featurize provided files and write to specified location.

    DEPRECATED: This method is now a wrapper for `create_dataset()`
@@ -253,8 +257,8 @@ class DataLoader(object):

    Parameters
    ----------
    input_files: list
      List of input filenames.
    inputs: list
      List of inputs to process. Entries can be filenames or arbitrary objects.
    data_dir: str, optional
      Directory to store featurized dataset.
    shard_size: int, optional
@@ -263,17 +267,17 @@ class DataLoader(object):
    Returns
    -------
    A `Dataset` object containing a featurized representation of data
    from `input_files`.
    from `inputs`.
    """
    warnings.warn(
        "featurize() is deprecated and has been renamed to create_dataset(). featurize() will be removed in DeepChem 3.0",
        FutureWarning)
    return self.create_dataset(input_files, data_dir, shard_size)
    return self.create_dataset(inputs, data_dir, shard_size)

  def create_dataset(self, input_files, data_dir=None, shard_size=8192):
  def create_dataset(self, inputs, data_dir=None, shard_size=8192):
    """Creates and returns a `Dataset` object by featurizing provided files.

    Reads in `input_files` and uses `self.featurizer` to featurize the
    Reads in `inputs` and uses `self.featurizer` to featurize the
    data in these input files.  For large files, automatically shards
    into smaller chunks of `shard_size` datapoints for convenience.
    Returns a `Dataset` object that contains the featurized dataset.
@@ -285,8 +289,8 @@ class DataLoader(object):

    Parameters
    ----------
    input_files: list
      List of input filenames.
    inputs: list
      List of inputs to process. Entries can be filenames or arbitrary objects.
    data_dir: str, optional
      Directory to store featurized dataset.
    shard_size: int, optional
@@ -295,17 +299,16 @@ class DataLoader(object):
    Returns
    -------
    A `Dataset` object containing a featurized representation of data
    from `input_files`.
    from `inputs`.
    """
    logger.info("Loading raw samples now.")
    logger.info("shard_size: %d" % shard_size)

    if not isinstance(input_files, list):
      input_files = [input_files]
    if not isinstance(inputs, list):
      inputs = [inputs]

    def shard_generator():
      for shard_num, shard in enumerate(
          self._get_shards(input_files, shard_size)):
      for shard_num, shard in enumerate(self._get_shards(inputs, shard_size)):
        time1 = time.time()
        X, valid_inds = self._featurize_shard(shard)
        ids = shard[self.id_field].values
@@ -329,11 +332,11 @@ class DataLoader(object):

    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)

  def _get_shards(self, input_files, shard_size):
  def _get_shards(self, inputs, shard_size):
    """Stub for children classes.

    Should implement a generator that walks over the source data in
    `input_files` and returns a "shard" at a time. Here a shard is a
    `inputs` and returns a "shard" at a time. Here a shard is a
    chunk of input data that can reasonably be handled in memory. For
    example, this may be a set of rows from a CSV file or a set of
    molecules from a SDF file. To re-use the
@@ -345,8 +348,8 @@ class DataLoader(object):
    
    Parameters
    ----------
    input_files: list
      List of input filenames.
    inputs: list
      List of inputs to process. Entries can be filenames or arbitrary objects.
    shard_size: int, optional
      Number of examples stored in each shard.
    """
@@ -411,7 +414,15 @@ class CSVLoader(DataLoader):
    self.log_every_n = log_every_n

  def _get_shards(self, input_files, shard_size):
    """Defines a generator which returns data for each shard"""
    """Defines a generator which returns data for each shard

    Parameters
    ----------
    input_files: list[str]
      List of filenames to process
    shard_size: int
      The size of a shard of data to process at a time.
    """
    return load_csv_files(input_files, shard_size)

  def _featurize_shard(self, shard):
@@ -812,6 +823,21 @@ class ImageLoader(DataLoader):

  @staticmethod
  def load_img(image_files):
    """Loads a set of images from disk.

    Parameters
    ----------
    image_files: list[str]
      List of image filenames to load

    Returns
    -------
    np.ndarray that contains the loaded images, of shape `(N, ...)`.

    Note
    ----
    This method requires PIL to be installed.
    """
    from PIL import Image
    images = []
    for image_file in image_files:
@@ -827,3 +853,182 @@ class ImageLoader(DataLoader):
      else:
        raise ValueError("Unsupported image filetype for %s" % image_file)
    return np.array(images)


class InMemoryLoader(DataLoader):
  """Facilitate Featurization of In-memory objects.

  When featurizing a dataset, it's often the case that the initial set of
  data (pre-featurization) fits handily within memory. (For example, perhaps
  it fits within a column of a pandas DataFrame.) In this case, it would be
  convenient to directly be able to featurize this column of data. However,
  the process of featurization often generates large arrays which quickly eat
  up available memory. This class provides convenient capabilities to process
  such in-memory data by checkpointing generated features periodically to
  disk.

  Example
  -------
  Here's an example with only datapoints and no labels or weights.

  >>> import deepchem as dc
  >>> smiles = ["C", "CC", "CCC", "CCCC"]
  >>> featurizer = dc.feat.CircularFingerprint()
  >>> loader = dc.data.InMemoryLoader(tasks=["task1"], featurizer=featurizer)
  >>> dataset = loader.create_dataset(smiles, shard_size=2)
  >>> len(dataset)
  4

  Here's an example with both datapoints and labels

  >>> import deepchem as dc
  >>> smiles = ["C", "CC", "CCC", "CCCC"]
  >>> labels = [1, 0, 1, 0]
  >>> featurizer = dc.feat.CircularFingerprint()
  >>> loader = dc.data.InMemoryLoader(tasks=["task1"], featurizer=featurizer)
  >>> dataset = loader.create_dataset(zip(smiles, labels), shard_size=2)
  >>> len(dataset)
  4

  Here's an example with datapoints, labels, weights and ids all provided.

  >>> import deepchem as dc
  >>> smiles = ["C", "CC", "CCC", "CCCC"]
  >>> labels = [1, 0, 1, 0]
  >>> weights = [1.5, 0, 1.5, 0]
  >>> ids = ["C", "CC", "CCC", "CCCC"]
  >>> featurizer = dc.feat.CircularFingerprint()
  >>> loader = dc.data.InMemoryLoader(tasks=["task1"], featurizer=featurizer)
  >>> dataset = loader.create_dataset(zip(smiles, labels, weights, ids), shard_size=2)
  >>> len(dataset)
  4

  """

  def create_dataset(self, inputs, data_dir=None, shard_size=8192):
    """Creates and returns a `Dataset` object by featurizing in-memory inputs.

    Splits `inputs` into shards of at most `shard_size` datapoints with
    `_get_shards`, featurizes each shard with `_featurize_shard`, and hands
    the resulting `(X, y, w, ids)` tuples to `DiskDataset.create_dataset`.
    Unlike the file-based loaders, each shard here is a plain Python list of
    entries rather than a pandas dataframe.

    Parameters
    ----------
    inputs: list
      Datapoints to process. Each entry must be of the form
      `(featurization_input, label, weight, id)` or `(featurization_input,
      label, weight)` or `(featurization_input, label)` or
      `featurization_input`. Any other iterable (for example a `zip`) is
      materialized into a list. A single non-iterable object, or a single
      `str`/`bytes` value, is treated as one datapoint.
    data_dir: str, optional
      Directory to store featurized dataset.
    shard_size: int, optional
      Number of examples stored in each shard.

    Returns
    -------
    A `Dataset` object containing a featurized representation of data
    from `inputs`.

    Raises
    ------
    ValueError
      If `inputs` contains no datapoints.
    """
    logger.info("Loading raw samples now.")
    logger.info("shard_size: %d", shard_size)

    if isinstance(inputs, (str, bytes)):
      # A string is iterable, so `list(inputs)` would silently explode one
      # datapoint (e.g. the SMILES "CCC") into its individual characters.
      inputs = [inputs]
    elif not isinstance(inputs, list):
      try:
        inputs = list(inputs)
      except TypeError:
        # Not iterable at all: treat it as a single datapoint.
        inputs = [inputs]

    if not inputs:
      # Without this check the failure surfaces later, from
      # `np.concatenate` on an empty feature list, with an opaque message.
      raise ValueError("No inputs were provided to featurize.")

    def shard_generator():
      # Running offset of the current shard within `inputs`; used to build
      # default ids for entries that do not carry their own.
      global_index = 0
      for shard_num, shard in enumerate(self._get_shards(inputs, shard_size)):
        time1 = time.time()
        X, y, w, ids = self._featurize_shard(shard, global_index)
        global_index += len(shard)

        time2 = time.time()
        logger.info("TIMING: featurizing shard %d took %0.3f s" %
                    (shard_num, time2 - time1))
        yield X, y, w, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)

  def _get_shards(self, inputs, shard_size):
    """Break up input into shards.

    Parameters
    ----------
    inputs: list[object]
      Each entry in this list must be of the form `(featurization_input,
      label, weight, id)` or `(featurization_input, label, weight)` or
      `(featurization_input, label)` or `featurization_input` for one
      datapoint, where `featurization_input` is any input that is recognized
      by `self.featurizer`.
    shard_size: int
      The size of shard to generate.

    Returns
    -------
    Iterator which iterates over shards of data. Each shard is a list of at
    most `shard_size` consecutive entries of `inputs`. No shard is yielded
    for empty `inputs`.
    """
    current_shard = []
    for i, datapoint in enumerate(inputs):
      if i != 0 and i % shard_size == 0:
        shard_data = current_shard
        current_shard = []
        yield shard_data
      current_shard.append(datapoint)
    # Never emit an empty shard: it cannot be featurized.
    if current_shard:
      yield current_shard

  def _featurize_shard(self, shard, global_index):
    """Featurizes a shard of an input data.

    Parameters
    ----------
    shard: list
      List each entry of which must be of the form `(featurization_input,
      label, weight, id)` or `(featurization_input, label, weight)` or
      `(featurization_input, label)` or `featurization_input` for one
      datapoint, where `featurization_input` is any input that is recognized
      by `self.featurizer`.
    global_index: int
      The starting index for this shard in the full set of provided inputs

    Returns
    -------
    Tuple `(X, y, w, ids)` of numpy arrays. `X` is the concatenation along
    axis 0 of the per-entry featurizer outputs. `y`, `w` and `ids` hold one
    element per entry. Missing weights default to ones, missing labels (and
    their weights) default to zeros, and missing ids default to the entry's
    position in the full input list.

    Raises
    ------
    ValueError
      If an entry is a tuple whose length is not between 1 and 4.
    """
    features = []
    labels = []
    weights = []
    ids = []
    n_tasks = len(self.tasks)
    for i, entry in enumerate(shard):
      if not isinstance(entry, tuple):
        entry = (entry,)
      # Reject empty tuples as well as over-long ones. An empty tuple used
      # to fall through every branch below and either raised NameError or
      # silently reused the previous entry's values.
      if not 1 <= len(entry) <= 4:
        raise ValueError(
            "Entry is malformed and must be of length 1-4 containing featurization_input and optionally label, weight, and id."
        )
      if len(entry) == 4:
        featurization_input, label, weight, entry_id = entry
      elif len(entry) == 3:
        featurization_input, label, weight = entry
        entry_id = global_index + i
      elif len(entry) == 2:
        featurization_input, label = entry
        weight = np.ones((n_tasks), np.float32)
        entry_id = global_index + i
      else:
        # NOTE(review): unlike the other branches, the featurizer receives
        # the 1-tuple here rather than the bare input. Kept as-is because
        # the featurizer's handling of single datapoints is not visible
        # from this class -- confirm whether this asymmetry is intended.
        featurization_input = entry
        # No label supplied: zero weight keeps the placeholder label from
        # contributing to any weighted computation.
        label = np.zeros((n_tasks), np.float32)
        weight = np.zeros((n_tasks), np.float32)
        entry_id = global_index + i
      feature = self.featurizer(featurization_input)
      features.append(feature)
      weights.append(weight)
      labels.append(label)
      ids.append(entry_id)
    X = np.concatenate(features, axis=0)
    y = np.array(labels)
    w = np.array(weights)
    ids = np.array(ids)
    return X, y, w, ids
+58 −0
Original line number Diff line number Diff line
import deepchem as dc
import numpy as np


def test_inmemory_features():
  """Featurize bare SMILES strings (no labels, weights or ids)."""
  molecules = ["C", "CC", "CCC", "CCCC"]
  loader = dc.data.InMemoryLoader(
      tasks=["task1"], featurizer=dc.feat.CircularFingerprint(size=1024))
  ds = loader.create_dataset(molecules, shard_size=2)
  assert len(ds) == 4
  assert ds.X.shape == (4, 1024)
  # Four datapoints with shard_size=2 must land in exactly two shards.
  assert ds.get_number_shards() == 2
  # Without explicit ids, the ids are the positions in the input list.
  expected_ids = np.arange(4)
  assert (ds.ids == expected_ids).all()


def test_inmemory_features_and_labels():
  """Featurize `(smiles, label)` pairs and check the labels round-trip."""
  molecules = ["C", "CC", "CCC", "CCCC"]
  targets = [1, 0, 1, 0]
  loader = dc.data.InMemoryLoader(
      tasks=["task1"], featurizer=dc.feat.CircularFingerprint(size=1024))
  pairs = zip(molecules, targets)
  ds = loader.create_dataset(pairs, shard_size=2)
  assert len(ds) == 4
  assert ds.X.shape == (4, 1024)
  assert (ds.y == np.array(targets)).all()
  assert ds.get_number_shards() == 2
  # Ids were not supplied, so they default to input positions.
  assert (ds.ids == np.arange(4)).all()


def test_inmemory_features_and_labels_and_weights():
  """Featurize `(smiles, label, weight)` triples; ids default to indices."""
  molecules = ["C", "CC", "CCC", "CCCC"]
  targets = [1, 0, 1, 0]
  sample_weights = [1.5, 1.5, 1, 1]
  loader = dc.data.InMemoryLoader(
      tasks=["task1"], featurizer=dc.feat.CircularFingerprint(size=1024))
  triples = zip(molecules, targets, sample_weights)
  ds = loader.create_dataset(triples, shard_size=2)
  assert len(ds) == 4
  assert ds.X.shape == (4, 1024)
  assert (ds.y == np.array(targets)).all()
  assert (ds.w == np.array(sample_weights)).all()
  assert (ds.ids == np.arange(4)).all()
  assert ds.get_number_shards() == 2


def test_inmemory_features_and_labels_and_weights_and_ids():
  """Featurize full `(smiles, label, weight, id)` entries."""
  molecules = ["C", "CC", "CCC", "CCCC"]
  targets = [1, 0, 1, 0]
  sample_weights = [1.5, 1.5, 1, 1]
  # The SMILES strings double as the explicit ids.
  identifiers = molecules
  loader = dc.data.InMemoryLoader(
      tasks=["task1"], featurizer=dc.feat.CircularFingerprint(size=1024))
  entries = zip(molecules, targets, sample_weights, identifiers)
  ds = loader.create_dataset(entries, shard_size=2)
  assert len(ds) == 4
  assert ds.X.shape == (4, 1024)
  assert (ds.y == np.array(targets)).all()
  assert (ds.w == np.array(sample_weights)).all()
  assert (ds.ids == np.array(identifiers)).all()
  assert ds.get_number_shards() == 2
+9 −0
Original line number Diff line number Diff line
@@ -183,6 +183,8 @@ class MolecularFeaturizer(Featurizer):
    """
    try:
      from rdkit import Chem
      from rdkit.Chem import rdmolfiles
      from rdkit.Chem import rdmolops
      from rdkit.Chem.rdchem import Mol
    except ModuleNotFoundError:
      raise ValueError("This class requires RDKit to be installed.")
@@ -201,6 +203,13 @@ class MolecularFeaturizer(Featurizer):
        if isinstance(mol, str):
          # mol must be a SMILES string so parse
          mol = Chem.MolFromSmiles(mol)
          # TODO (ytz) this is a bandage solution to reorder the atoms
          # so that they're always in the same canonical order.
          # Presumably this should be correctly implemented in the
          # future for graph mols.
          if mol:
            new_order = rdmolfiles.CanonicalRankAtoms(mol)
            mol = rdmolops.RenumberAtoms(mol, new_order)
        features.append(self._featurize(mol))
      except:
        logger.warning(
+55 −13
Original line number Diff line number Diff line
@@ -45,21 +45,32 @@ def get_input_type(input_file):
    raise ValueError("Unrecognized extension %s" % file_extension)


def load_data(input_files, shard_size=None, verbose=True):
def load_data(input_files, shard_size=None):
  """Loads data from disk.

  For CSV files, supports sharded loading for large files.

  Parameters
  ----------
  input_files: list
    List of filenames.
  shard_size: int, optional (default None)
    Size of shard to yield

  Returns
  -------
  Iterator which iterates over provided files.
  """
  if not len(input_files):
    return
  input_type = get_input_type(input_files[0])
  if input_type == "sdf":
    if shard_size is not None:
      logger.info("Ignoring shard_size for sdf input.", verbose)
      logger.info("Ignoring shard_size for sdf input.")
    for value in load_sdf_files(input_files):
      yield value
  elif input_type == "csv":
    for value in load_csv_files(input_files, shard_size, verbose=verbose):
    for value in load_csv_files(input_files, shard_size):
      yield value
  elif input_type == "pandas-pickle":
    for input_file in input_files:
@@ -67,7 +78,29 @@ def load_data(input_files, shard_size=None, verbose=True):


def load_sdf_files(input_files, clean_mols, tasks=[]):
  """Load SDF file into dataframe."""
  """Load SDF file into dataframe.

  Parameters
  ----------
  input_files: list[str]
    List of filenames
  clean_mols: bool
    Whether to sanitize molecules.
  tasks: list, optional (default [])
    Each entry in `tasks` is treated as a property in the SDF file and is
    retrieved with `mol.GetProp(str(task))` where `mol` is the RDKit mol
    loaded from a given SDF entry.

  Note
  ----
  This function requires RDKit to be installed.

  Returns
  -------
  dataframes: list
    This function returns a list of pandas dataframes. Each dataframe will
    contain the columns `('mol_id', 'smiles', 'mol')`.
  """
  from rdkit import Chem
  dataframes = []
  for input_file in input_files:
@@ -97,19 +130,30 @@ def load_sdf_files(input_files, clean_mols, tasks=[]):
  return dataframes


def load_csv_files(filenames, shard_size=None, verbose=True):
  """Load data as pandas dataframe."""
def load_csv_files(filenames, shard_size=None):
  """Load data as pandas dataframe.

  Parameters
  ----------
  filenames: list[str]
    List of filenames
  shard_size: int, optional (default None) 
    The shard size to yield at one time.

  Returns
  -------
  Iterator which iterates over shards of data.
  """
  # First line of user-specified CSV *must* be header.
  shard_num = 1
  for filename in filenames:
    if shard_size is None:
      yield pd.read_csv(filename)
    else:
      logger.info("About to start loading CSV from %s" % filename, verbose)
      logger.info("About to start loading CSV from %s" % filename)
      for df in pd.read_csv(filename, chunksize=shard_size):
        logger.info(
            "Loading shard %d of size %s." % (shard_num, str(shard_size)),
            verbose)
            "Loading shard %d of size %s." % (shard_num, str(shard_size)))
        df = df.replace(np.nan, str(""), regex=True)
        shard_num += 1
        yield df
@@ -227,8 +271,8 @@ def encode_bio_sequence(fname, file_type="fasta", letters="ATCGN"):


def save_metadata(tasks, metadata_df, data_dir):
  """
  Saves the metadata for a DiskDataset
  """Saves the metadata for a DiskDataset

  Parameters
  ----------
  tasks: list of str
@@ -236,8 +280,6 @@ def save_metadata(tasks, metadata_df, data_dir):
  metadata_df: pd.DataFrame
  data_dir: str
    Directory to store metadata
  Returns
  -------
  """
  if isinstance(tasks, np.ndarray):
    tasks = tasks.tolist()
Loading