Unverified Commit bebb5562 authored by Nathan Frey's avatar Nathan Frey Committed by GitHub
Browse files

Merge branch 'master' into material_featurizer_renames

parents fd42a76e 53366e7d
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -18,3 +18,4 @@ from deepchem.data.data_loader import JsonLoader
from deepchem.data.data_loader import SDFLoader
from deepchem.data.data_loader import FASTALoader
from deepchem.data.data_loader import ImageLoader
from deepchem.data.data_loader import InMemoryLoader
+311 −47
Original line number Diff line number Diff line
@@ -12,13 +12,14 @@ import time
import sys
import logging
import warnings
from typing import List, Optional, Dict, Tuple
from typing import List, Optional, Dict, Tuple, Any, Sequence, Union

from deepchem.utils.typing import OneOrMany
from deepchem.utils.save import load_csv_files, load_json_files
from deepchem.utils.save import load_sdf_files
from deepchem.utils.genomics import encode_fasta_sequence
from deepchem.feat import UserDefinedFeaturizer, Featurizer
from deepchem.data import DiskDataset, NumpyDataset, ImageDataset
from deepchem.data import Dataset, DiskDataset, NumpyDataset, ImageDataset
import zipfile

logger = logging.getLogger(__name__)
@@ -85,6 +86,10 @@ def _featurize_smiles_df(df, featurizer, field, log_every_n=1000):
    The name of a column in `df` that holds SMILES strings
  log_every_n: int, optional (default 1000)
    Emit a logging statement every `log_every_n` rows.

  Note
  ----
  This function requires RDKit to be installed
  """
  sample_elems = df[field].tolist()

@@ -238,7 +243,10 @@ class DataLoader(object):
    self.featurizer = featurizer
    self.log_every_n = log_every_n

  def featurize(self, input_files, data_dir=None, shard_size=8192):
  def featurize(self,
                inputs: Sequence[Any],
                data_dir: Optional[str] = None,
                shard_size: Optional[int] = 8192) -> Dataset:
    """Featurize provided files and write to specified location.

    DEPRECATED: This method is now a wrapper for `create_dataset()`
@@ -253,8 +261,8 @@ class DataLoader(object):

    Parameters
    ----------
    input_files: list
      List of input filenames.
    inputs: list
      List of inputs to process. Entries can be filenames or arbitrary objects.
    data_dir: str, optional
      Directory to store featurized dataset.
    shard_size: int, optional
@@ -263,18 +271,21 @@ class DataLoader(object):
    Returns
    -------
    A `Dataset` object containing a featurized representation of data
    from `input_files`.
    from `inputs`.
    """
    warnings.warn(
        "featurize() is deprecated and has been renamed to create_dataset(). featurize() will be removed in DeepChem 3.0",
        FutureWarning)
    return self.create_dataset(input_files, data_dir, shard_size)
    return self.create_dataset(inputs, data_dir, shard_size)

  def create_dataset(self, input_files, data_dir=None, shard_size=8192):
  def create_dataset(self,
                     inputs: Sequence[Any],
                     data_dir: Optional[str] = None,
                     shard_size: Optional[int] = 8192) -> Dataset:
    """Creates and returns a `Dataset` object by featurizing provided files.

    Reads in `input_files` and uses `self.featurizer` to featurize the
    data in these input files.  For large files, automatically shards
    Reads in `inputs` and uses `self.featurizer` to featurize the
    data in these inputs.  For large files, automatically shards
    into smaller chunks of `shard_size` datapoints for convenience.
    Returns a `Dataset` object that contains the featurized dataset.

@@ -285,8 +296,8 @@ class DataLoader(object):

    Parameters
    ----------
    input_files: list
      List of input filenames.
    inputs: list
      List of inputs to process. Entries can be filenames or arbitrary objects.
    data_dir: str, optional
      Directory to store featurized dataset.
    shard_size: int, optional
@@ -295,17 +306,16 @@ class DataLoader(object):
    Returns
    -------
    A `Dataset` object containing a featurized representation of data
    from `input_files`.
    from `inputs`.
    """
    logger.info("Loading raw samples now.")
    logger.info("shard_size: %d" % shard_size)
    logger.info("shard_size: %s" % str(shard_size))

    if not isinstance(input_files, list):
      input_files = [input_files]
    if not isinstance(inputs, list):
      inputs = [inputs]

    def shard_generator():
      for shard_num, shard in enumerate(
          self._get_shards(input_files, shard_size)):
      for shard_num, shard in enumerate(self._get_shards(inputs, shard_size)):
        time1 = time.time()
        X, valid_inds = self._featurize_shard(shard)
        ids = shard[self.id_field].values
@@ -329,11 +339,11 @@ class DataLoader(object):

    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)

  def _get_shards(self, input_files, shard_size):
  def _get_shards(self, inputs, shard_size):
    """Stub for children classes.

    Should implement a generator that walks over the source data in
    `input_files` and returns a "shard" at a time. Here a shard is a
    `inputs` and returns a "shard" at a time. Here a shard is a
    chunk of input data that can reasonably be handled in memory. For
    example, this may be a set of rows from a CSV file or a set of
    molecules from a SDF file. To re-use the
@@ -345,8 +355,8 @@ class DataLoader(object):
    
    Parameters
    ----------
    input_files: list
      List of input filenames.
    inputs: list
      List of inputs to process. Entries can be filenames or arbitrary objects.
    shard_size: int, optional
      Number of examples stored in each shard.
    """
@@ -411,7 +421,15 @@ class CSVLoader(DataLoader):
    self.log_every_n = log_every_n

  def _get_shards(self, input_files, shard_size):
    """Defines a generator which returns data for each shard"""
    """Defines a generator which returns data for each shard

    Parameters
    ----------
    input_files: list[str]
      List of filenames to process
    shard_size: int
      The size of a shard of data to process at a time.
    """
    return load_csv_files(input_files, shard_size)

  def _featurize_shard(self, shard):
@@ -467,7 +485,7 @@ class JsonLoader(DataLoader):
  """

  def __init__(self,
               tasks: List[str],
               tasks: OneOrMany[str],
               feature_field: str,
               label_field: str = None,
               weight_field: str = None,
@@ -510,14 +528,14 @@ class JsonLoader(DataLoader):
    self.log_every_n = log_every_n

  def create_dataset(self,
                     input_files: List[str],
                     input_files: OneOrMany[str],
                     data_dir: Optional[str] = None,
                     shard_size: Optional[int] = 8192) -> DiskDataset:
    """Creates a `Dataset` from input JSON files.

    Parameters
    ----------
    input_files: List[str]
    input_files: OneOrMany[str]
      List of JSON filenames.
    data_dir: Optional[str], default None
      Name of directory where featurized data is stored.
@@ -531,9 +549,16 @@ class JsonLoader(DataLoader):
      from `input_files`.

    """

    if not isinstance(input_files, list):
      try:
        if isinstance(input_files, str):
          input_files = [input_files]
        else:
          input_files = list(input_files)
      except TypeError:
        raise ValueError(
            "input_files is of an unrecognized form. Must be one filename or a list of filenames."
        )

    def shard_generator():
      """Yield X, y, w, and ids for shards."""
@@ -685,7 +710,10 @@ class FASTALoader(DataLoader):
    """Initialize loader."""
    pass

  def create_dataset(self, input_files, data_dir=None, shard_size=None):
  def create_dataset(self,
                     input_files: OneOrMany[str],
                     data_dir: Optional[str] = None,
                     shard_size: Optional[int] = None) -> DiskDataset:
    """Creates a `Dataset` from input FASTA files.

    At present, FASTA support is limited and only allows for one-hot
@@ -706,7 +734,7 @@ class FASTALoader(DataLoader):
    A `Dataset` object containing a featurized representation of data
    from `input_files`.
    """
    if not isinstance(input_files, list):
    if isinstance(input_files, str):
      input_files = [input_files]

    def shard_generator():
@@ -728,7 +756,7 @@ class ImageLoader(DataLoader):
  traverse subdirectories which contain images.
  """

  def __init__(self, tasks=None):
  def __init__(self, tasks: OneOrMany[str] = None):
    """Initialize image loader.

    At present, custom image featurizers aren't supported by this
@@ -744,22 +772,30 @@ class ImageLoader(DataLoader):
    self.tasks = tasks

  def create_dataset(self,
                     input_files,
                     labels=None,
                     weights=None,
                     in_memory=False):
                     inputs: Union[OneOrMany[str], Tuple[Any]],
                     data_dir: Optional[str] = None,
                     shard_size: Optional[int] = 8192,
                     in_memory: bool = False) -> Dataset:
    """Creates and returns a `Dataset` object by featurizing provided image files and labels/weights.

    Parameters
    ----------
    input_files: list
      Each file in this list should either be of a supported
      image format (.png, .tif only for now) or of a compressed
      folder of image files (only .zip for now).
    labels: optional
      If provided, a numpy ndarray of image labels
    weights: optional
      If provided, a numpy ndarray of image weights
    inputs: `Union[OneOrMany[str], Tuple[Any]]`
      The inputs provided should be one of the following

      - filename
      - list of filenames
      - Tuple (list of filenames, labels)
      - Tuple (list of filenames, labels, weights)

      Each file in a given list of filenames should either be of a supported
      image format (.png, .tif only for now) or of a compressed folder of
      image files (only .zip for now). If `labels` or `weights` are provided,
      they must correspond to the sorted order of all filenames provided, with
      one label/weight per file.

    data_dir: str, optional
      Directory to store featurized dataset.
    in_memory: bool
      If true, return in-memory NumpyDataset. Else return ImageDataset.

@@ -767,8 +803,23 @@ class ImageLoader(DataLoader):
    -------
    A `Dataset` object containing a featurized representation of data
    from `input_files`, `labels`, and `weights`.

    """
    if not isinstance(input_files, list):
    labels, weights = None, None
    if isinstance(inputs, tuple):
      if len(inputs) == 1:
        input_files = inputs[0]
        if isinstance(inputs, str):
          input_files = [inputs]
      elif len(inputs) == 2:
        input_files, labels = inputs
      elif len(inputs) == 3:
        input_files, labels, weights = inputs
      else:
        raise ValueError("Input must be a tuple of length 1, 2, or 3")
    else:
      input_files = inputs
    if isinstance(input_files, str):
      input_files = [input_files]

    image_files = []
@@ -804,14 +855,44 @@ class ImageLoader(DataLoader):
          raise ValueError("Unsupported file format")
      input_files = remainder

    # Sort image files
    image_files = sorted(image_files)

    if in_memory:
      if data_dir is None:
        return NumpyDataset(
            self.load_img(image_files), y=labels, w=weights, ids=image_files)
      else:
        dataset = DiskDataset.from_numpy(
            self.load_img(image_files),
            y=labels,
            w=weights,
            ids=image_files,
            tasks=self.tasks,
            data_dir=data_dir)
        if shard_size is not None:
          dataset.reshard(shard_size)
        return dataset
    else:
      return ImageDataset(image_files, y=labels, w=weights, ids=image_files)

  @staticmethod
  def load_img(image_files):
  def load_img(image_files) -> np.ndarray:
    """Loads a set of images from disk.

    Parameters
    ----------
    image_files: list[str]
      List of image filenames to load

    Returns
    -------
    np.ndarray that contains loaded images. Of shape `(N,...)`.

    Note
    ----
    This method requires PIL to be installed.
    """
    from PIL import Image
    images = []
    for image_file in image_files:
@@ -827,3 +908,186 @@ class ImageLoader(DataLoader):
      else:
        raise ValueError("Unsupported image filetype for %s" % image_file)
    return np.array(images)


class InMemoryLoader(DataLoader):
  """Facilitate featurization of in-memory objects.

  When featurizing a dataset, it's often the case that the initial set of
  data (pre-featurization) fits handily within memory. (For example, perhaps
  it fits within a column of a pandas DataFrame.) In this case, it would be
  convenient to directly be able to featurize this column of data. However,
  the process of featurization often generates large arrays which quickly eat
  up available memory. This class provides convenient capabilities to process
  such in-memory data by checkpointing generated features periodically to
  disk.

  Example
  -------
  Here's an example with only datapoints and no labels or weights.

  >>> import deepchem as dc
  >>> smiles = ["C", "CC", "CCC", "CCCC"]
  >>> featurizer = dc.feat.CircularFingerprint()
  >>> loader = dc.data.InMemoryLoader(tasks=["task1"], featurizer=featurizer)
  >>> dataset = loader.create_dataset(smiles, shard_size=2)
  >>> len(dataset)
  4

  Here's an example with both datapoints and labels.

  >>> import deepchem as dc
  >>> smiles = ["C", "CC", "CCC", "CCCC"]
  >>> labels = [1, 0, 1, 0]
  >>> featurizer = dc.feat.CircularFingerprint()
  >>> loader = dc.data.InMemoryLoader(tasks=["task1"], featurizer=featurizer)
  >>> dataset = loader.create_dataset(zip(smiles, labels), shard_size=2)
  >>> len(dataset)
  4

  Here's an example with datapoints, labels, weights and ids all provided.

  >>> import deepchem as dc
  >>> smiles = ["C", "CC", "CCC", "CCCC"]
  >>> labels = [1, 0, 1, 0]
  >>> weights = [1.5, 0, 1.5, 0]
  >>> ids = ["C", "CC", "CCC", "CCCC"]
  >>> featurizer = dc.feat.CircularFingerprint()
  >>> loader = dc.data.InMemoryLoader(tasks=["task1"], featurizer=featurizer)
  >>> dataset = loader.create_dataset(zip(smiles, labels, weights, ids), shard_size=2)
  >>> len(dataset)
  4

  """

  def create_dataset(self,
                     inputs: Sequence[Any],
                     data_dir: Optional[str] = None,
                     shard_size: Optional[int] = 8192) -> DiskDataset:
    """Creates and returns a `Dataset` object by featurizing provided files.

    Reads in `inputs` and uses `self.featurizer` to featurize the
    data in these input files.  For large files, automatically shards
    into smaller chunks of `shard_size` datapoints for convenience.
    Returns a `Dataset` object that contains the featurized dataset.

    This implementation assumes that the helper methods `_get_shards`
    and `_featurize_shard` are implemented and that each shard
    returned by `_get_shards` is a pandas dataframe.  You may choose
    to reuse or override this method in your subclass implementations.

    Parameters
    ----------
    inputs: Sequence[Any]
      List of inputs to process. Entries can be arbitrary objects so long as
      they are understood by `self.featurizer`
    data_dir: str, optional
      Directory to store featurized dataset.
    shard_size: int, optional
      Number of examples stored in each shard. If None, all inputs are
      placed in a single shard.

    Returns
    -------
    A `Dataset` object containing a featurized representation of data
    from `inputs`.
    """
    logger.info("Loading raw samples now.")
    logger.info("shard_size: %s" % str(shard_size))

    # Materialize generators/iterables (e.g. zip objects from the examples
    # above) into a list so entries can be counted and sharded.
    if not isinstance(inputs, list):
      try:
        inputs = list(inputs)
      except TypeError:
        inputs = [inputs]

    def shard_generator():
      global_index = 0
      for shard_num, shard in enumerate(self._get_shards(inputs, shard_size)):
        time1 = time.time()
        X, y, w, ids = self._featurize_shard(shard, global_index)
        global_index += len(shard)

        time2 = time.time()
        logger.info("TIMING: featurizing shard %d took %0.3f s" %
                    (shard_num, time2 - time1))
        yield X, y, w, ids

    return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)

  def _get_shards(self, inputs, shard_size):
    """Break up input into shards.

    Parameters
    ----------
    inputs: list[object]
      Each entry in this list must be of the form `(featurization_input,
      label, weight, id)` or `(featurization_input, label, weight)` or
      `(featurization_input, label)` or `featurization_input` for one
      datapoint, where `featurization_input` is any input that is recognized
      by `self.featurizer`.
    shard_size: int, optional
      The size of shard to generate. If None, no sharding is performed and
      all of `inputs` is yielded as one shard.

    Returns
    -------
    Iterator which iterates over shards of data.
    """
    current_shard: List[Any] = []
    for i, datapoint in enumerate(inputs):
      # Emit a completed shard before starting the next one. A `shard_size`
      # of None means everything goes into a single shard.
      if shard_size is not None and i != 0 and i % shard_size == 0:
        shard_data = current_shard
        current_shard = []
        yield shard_data
      current_shard.append(datapoint)
    # Guard against yielding an empty trailing shard when `inputs` is empty;
    # for non-empty `inputs` the final shard always holds at least one entry.
    if current_shard:
      yield current_shard

  def _featurize_shard(self, shard, global_index):
    """Featurizes a shard of an input data.

    Parameters
    ----------
    shard: list
      List each entry of which must be of the form `(featurization_input,
      label, weight, id)` or `(featurization_input, label, weight)` or
      `(featurization_input, label)` or `featurization_input` for one
      datapoint, where `featurization_input` is any input that is recognized
      by `self.featurizer`.
    global_index: int
      The starting index for this shard in the full set of provided inputs

    Returns
    -------
    Tuple `(X, y, w, ids)` of numpy arrays for this shard. Missing labels
    and weights default to zeros/ones respectively; missing ids default to
    the datapoint's global index.

    Raises
    ------
    ValueError
      If any tuple entry has a length outside 1-4.
    """
    features = []
    labels = []
    weights = []
    ids = []
    n_tasks = len(self.tasks)
    for i, entry in enumerate(shard):
      if not isinstance(entry, tuple):
        entry = (entry,)
      # Also reject zero-length tuples, which would otherwise fall through
      # every branch below and leave `featurization_input` unbound.
      if not 1 <= len(entry) <= 4:
        raise ValueError(
            "Entry is malformed and must be of length 1-4 containing featurization_input and optionally label, weight, and id."
        )
      if len(entry) == 4:
        featurization_input, label, weight, entry_id = entry
      elif len(entry) == 3:
        featurization_input, label, weight = entry
        entry_id = global_index + i
      elif len(entry) == 2:
        featurization_input, label = entry
        weight = np.ones((n_tasks), np.float32)
        entry_id = global_index + i
      elif len(entry) == 1:
        # Unwrap the 1-tuple built above so the featurizer receives the raw
        # datapoint, consistent with the 2/3/4-tuple branches.
        featurization_input = entry[0]
        label = np.zeros((n_tasks), np.float32)
        weight = np.zeros((n_tasks), np.float32)
        entry_id = global_index + i
      feature = self.featurizer(featurization_input)
      features.append(feature)
      weights.append(weight)
      labels.append(label)
      ids.append(entry_id)
    X = np.concatenate(features, axis=0)
    y = np.array(labels)
    w = np.array(weights)
    ids = np.array(ids)
    return X, y, w, ids
+20 −8
Original line number Diff line number Diff line
@@ -888,7 +888,7 @@ class NumpyDataset(Dataset):
        for i in order:
          yield (self._X[i], self._y[i], self._w[i], self._ids[i])

    class TorchDataset(torch.utils.data.IterableDataset):
    class TorchDataset(torch.utils.data.IterableDataset):  # type: ignore

      def __iter__(self):
        return iterate()
@@ -1090,15 +1090,14 @@ class DiskDataset(Dataset):
    Gets learning tasks associated with this dataset.
    """
    return self.tasks
    # if not len(self.metadata_df):
    #  raise ValueError("No data in dataset.")
    # return next(self.metadata_df.iterrows())[1]['task_names']

  def reshard(self, shard_size: int) -> None:
    """Reshards data to have specified shard size."""
    # Create temp directory to store resharded version
    reshard_dir = tempfile.mkdtemp()

    n_shards = self.get_number_shards()

    # Write data in new shards
    def generator():
      tasks = self.get_task_names()
@@ -1106,7 +1105,8 @@ class DiskDataset(Dataset):
      y_next = np.zeros((0,) + (len(tasks),))
      w_next = np.zeros((0,) + (len(tasks),))
      ids_next = np.zeros((0,), dtype=object)
      for (X, y, w, ids) in self.itershards():
      for shard_num, (X, y, w, ids) in enumerate(self.itershards()):
        logger.info("Resharding shard %d/%d" % (shard_num, n_shards))
        X_next = np.concatenate([X_next, X], axis=0)
        y_next = np.concatenate([y_next, y], axis=0)
        w_next = np.concatenate([w_next, w], axis=0)
@@ -1366,8 +1366,11 @@ class DiskDataset(Dataset):
      out_dir = tempfile.mkdtemp()
    tasks = self.get_task_names()

    n_shards = self.get_number_shards()

    def generator():
      for shard_num, row in self.metadata_df.iterrows():
        logger.info("Transforming shard %d/%d" % (shard_num, n_shards))
        X, y, w, ids = self.get_shard(shard_num)
        newx, newy, neww = fn(X, y, w)
        yield (newx, newy, neww, ids)
@@ -1409,7 +1412,7 @@ class DiskDataset(Dataset):
          for i in range(X.shape[0]):
            yield (X[i], y[i], w[i], ids[i])

    class TorchDataset(torch.utils.data.IterableDataset):
    class TorchDataset(torch.utils.data.IterableDataset):  # type: ignore

      def __iter__(self):
        return iterate()
@@ -1485,6 +1488,7 @@ class DiskDataset(Dataset):

    def generator():
      for ind, dataset in enumerate(datasets):
        logger.info("Merging in dataset %d/%d" % (ind, len(datasets)))
        X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
        yield (X, y, w, ids)

@@ -1761,9 +1765,12 @@ class DiskDataset(Dataset):
    indices = np.array(sorted(indices)).astype(int)
    tasks = self.get_task_names()

    n_shards = self.get_number_shards()

    def generator():
      count, indices_count = 0, 0
      for shard_num, (X, y, w, ids) in enumerate(self.itershards()):
        logger.info("Selecting from shard %d/%d" % (shard_num, n_shards))
        shard_len = len(X)
        # Find indices which rest in this shard
        num_shard_elts = 0
@@ -1936,7 +1943,12 @@ class ImageDataset(Dataset):
    self._X_shape = self._find_array_shape(X)
    self._y_shape = self._find_array_shape(y)
    if w is None:
      if len(self._y_shape) == 1:
      if len(self._y_shape) == 0:
        # Case n_samples should be 1
        if n_samples != 1:
          raise ValueError("y can only be a scalar if n_samples == 1")
        w = np.ones_like(y)
      elif len(self._y_shape) == 1:
        w = np.ones(self._y_shape[0], np.float32)
      else:
        w = np.ones((self._y_shape[0], 1), np.float32)
@@ -2164,7 +2176,7 @@ class ImageDataset(Dataset):
          yield (get_image(self._X, i), get_image(self._y, i), self._w[i],
                 self._ids[i])

    class TorchDataset(torch.utils.data.IterableDataset):
    class TorchDataset(torch.utils.data.IterableDataset):  # type: ignore

      def __iter__(self):
        return iterate()
+8 −0
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ import tempfile
from scipy import misc
import deepchem as dc
import zipfile
import numpy as np


class TestImageLoader(unittest.TestCase):
@@ -62,6 +63,13 @@ class TestImageLoader(unittest.TestCase):
    # These are the known dimensions of face.png
    assert dataset.X.shape == (1, 768, 1024, 3)

  def test_png_simple_load_with_labels(self):
    loader = dc.data.ImageLoader()
    dataset = loader.featurize((self.face_path, np.array(1)))
    # These are the known dimensions of face.png
    assert dataset.X.shape == (1, 768, 1024, 3)
    assert (dataset.y == np.ones((1,))).all()

  def test_tif_simple_load(self):
    loader = dc.data.ImageLoader()
    dataset = loader.featurize(self.tif_image_path)
+58 −0
Original line number Diff line number Diff line
import deepchem as dc
import numpy as np


def test_inmemory_features():
  """Featurize a bare list of SMILES; ids default to sequential indices."""
  molecules = ["C", "CC", "CCC", "CCCC"]
  loader = dc.data.InMemoryLoader(
      tasks=["task1"], featurizer=dc.feat.CircularFingerprint(size=1024))
  ds = loader.create_dataset(molecules, shard_size=2)
  assert len(ds) == 4
  assert ds.X.shape == (4, 1024)
  assert ds.get_number_shards() == 2
  assert (ds.ids == np.arange(4)).all()


def test_inmemory_features_and_labels():
  """Featurize (smiles, label) pairs supplied as an iterator of tuples."""
  molecules = ["C", "CC", "CCC", "CCCC"]
  targets = [1, 0, 1, 0]
  loader = dc.data.InMemoryLoader(
      tasks=["task1"], featurizer=dc.feat.CircularFingerprint(size=1024))
  ds = loader.create_dataset(zip(molecules, targets), shard_size=2)
  assert len(ds) == 4
  assert ds.X.shape == (4, 1024)
  assert ds.get_number_shards() == 2
  assert (ds.y == np.array(targets)).all()
  assert (ds.ids == np.arange(4)).all()


def test_inmemory_features_and_labels_and_weights():
  """Featurize (smiles, label, weight) triples; ids still auto-generated."""
  molecules = ["C", "CC", "CCC", "CCCC"]
  targets = [1, 0, 1, 0]
  sample_weights = [1.5, 1.5, 1, 1]
  loader = dc.data.InMemoryLoader(
      tasks=["task1"], featurizer=dc.feat.CircularFingerprint(size=1024))
  ds = loader.create_dataset(
      zip(molecules, targets, sample_weights), shard_size=2)
  assert len(ds) == 4
  assert ds.X.shape == (4, 1024)
  assert ds.get_number_shards() == 2
  assert (ds.y == np.array(targets)).all()
  assert (ds.w == np.array(sample_weights)).all()
  assert (ds.ids == np.arange(4)).all()


def test_inmemory_features_and_labels_and_weights_and_ids():
  """Featurize full (smiles, label, weight, id) 4-tuples; ids are preserved."""
  molecules = ["C", "CC", "CCC", "CCCC"]
  targets = [1, 0, 1, 0]
  sample_weights = [1.5, 1.5, 1, 1]
  identifiers = molecules
  loader = dc.data.InMemoryLoader(
      tasks=["task1"], featurizer=dc.feat.CircularFingerprint(size=1024))
  ds = loader.create_dataset(
      zip(molecules, targets, sample_weights, identifiers), shard_size=2)
  assert len(ds) == 4
  assert ds.X.shape == (4, 1024)
  assert ds.get_number_shards() == 2
  assert (ds.y == np.array(targets)).all()
  assert (ds.w == np.array(sample_weights)).all()
  assert (ds.ids == np.array(identifiers)).all()
Loading