Unverified commit ab097fde authored by Bharath Ramsundar, committed by GitHub

Merge pull request #1844 from deepchem/dataset_print

Pretty Printing Datasets
parents a8d420da 63dafb95
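
Judging from the test added below, the change gives Dataset objects a compact one-line string form and adds two controls, dc.utils.set_print_threshold and dc.utils.set_max_print_size, for how much of the ids and task_names arrays is shown. A short sketch of the new behavior (expected output copied from the added test; the comments on the two setters are inferred from those tests):

>>> import numpy as np
>>> import deepchem as dc
>>> dataset = dc.data.NumpyDataset(
...     X=np.random.rand(5, 3), y=np.random.rand(5,), ids=np.arange(5))
>>> print(dataset)
<NumpyDataset X.shape: (5, 3), y.shape: (5,), w.shape: (5,), ids: [0 1 2 3 4], task_names: [0]>
>>> dc.utils.set_print_threshold(10)  # ids longer than this are summarized with '...'
>>> dataset = dc.data.NumpyDataset(
...     X=np.random.rand(50, 3), y=np.random.rand(50,), ids=np.arange(50))
>>> print(dataset)
<NumpyDataset X.shape: (50, 3), y.shape: (50,), w.shape: (50,), ids: [0 1 2 ... 47 48 49], task_names: [0]>
>>> dc.utils.set_max_print_size(25)  # datasets larger than this drop ids from the string
>>> dataset = dc.data.NumpyDataset(
...     X=np.random.rand(50, 3), y=np.random.rand(50,), ids=np.arange(50))
>>> print(dataset)
<NumpyDataset X.shape: (50, 3), y.shape: (50,), w.shape: (50,), task_names: [0]>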
+394 −122


Preview size limit exceeded, changes collapsed.

+26 −3
@@ -794,6 +794,29 @@ class TestDatasets(test_util.TensorFlowTestCase):
    np.testing.assert_array_equal(
        np.stack([dataset.y[:, 0], dataset.X[:, 0]], axis=1), dataset3.w)


+  def test_to_str(self):
+    """Tests to string representation of Dataset."""
+    dataset = dc.data.NumpyDataset(
+        X=np.random.rand(5, 3), y=np.random.rand(5,), ids=np.arange(5))
+    ref_str = '<NumpyDataset X.shape: (5, 3), y.shape: (5,), w.shape: (5,), ids: [0 1 2 3 4], task_names: [0]>'
+    assert str(dataset) == ref_str
+
+    # Test id shrinkage
+    dc.utils.set_print_threshold(10)
+    dataset = dc.data.NumpyDataset(
+        X=np.random.rand(50, 3), y=np.random.rand(50,), ids=np.arange(50))
+    ref_str = '<NumpyDataset X.shape: (50, 3), y.shape: (50,), w.shape: (50,), ids: [0 1 2 ... 47 48 49], task_names: [0]>'
+    assert str(dataset) == ref_str
+
+    # Test task shrinkage
+    dataset = dc.data.NumpyDataset(
+        X=np.random.rand(50, 3), y=np.random.rand(50, 20), ids=np.arange(50))
+    ref_str = '<NumpyDataset X.shape: (50, 3), y.shape: (50, 20), w.shape: (50, 1), ids: [0 1 2 ... 47 48 49], task_names: [ 0  1  2 ... 17 18 19]>'
+    assert str(dataset) == ref_str
+
+    # Test max print size
+    dc.utils.set_max_print_size(25)
+    dataset = dc.data.NumpyDataset(
+        X=np.random.rand(50, 3), y=np.random.rand(50,), ids=np.arange(50))
+    ref_str = '<NumpyDataset X.shape: (50, 3), y.shape: (50,), w.shape: (50,), task_names: [0]>'
+    assert str(dataset) == ref_str


if __name__ == "__main__":
  unittest.main()
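
The shrinkage pattern in these expected strings matches numpy's summarized array printing. A rough sketch of how such a one-line summary could be assembled (illustrative only; the helper below is hypothetical, and the actual implementation lives in the part of the diff collapsed above):

import numpy as np

def summarize_dataset(X, y, w, ids, task_names, threshold=10):
  # Summarize long id/task arrays the way numpy does ([0 1 2 ... 47 48 49])
  # once they exceed `threshold` entries; shapes are always printed in full.
  ids_str = np.array2string(np.asarray(ids), threshold=threshold)
  tasks_str = np.array2string(np.asarray(task_names), threshold=threshold)
  return ('<NumpyDataset X.shape: %s, y.shape: %s, w.shape: %s, '
          'ids: %s, task_names: %s>' %
          (X.shape, y.shape, w.shape, ids_str, tasks_str))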
+4 −8
@@ -60,11 +60,9 @@ class CoulombMatrix(Featurizer):
  >>> featurizers = dc.feat.CoulombMatrix(max_atoms=23)
  >>> input_file = 'deepchem/feat/tests/data/water.sdf' # really backed by water.sdf.csv
  >>> tasks = ["atomization_energy"]
-  >>> featurizer = dc.data.SDFLoader(tasks, featurizer=featurizers)
-  >>> dataset = featurizer.featurize(input_file) #doctest: +ELLIPSIS
+  >>> loader = dc.data.SDFLoader(tasks, featurizer=featurizers)
+  >>> dataset = loader.create_dataset(input_file) #doctest: +ELLIPSIS
  Reading structures from deepchem/feat/tests/data/water.sdf.
  ...
  Loading dataset from disk.
  """
  conformers = True
  name = 'coulomb_matrix'
@@ -221,11 +219,9 @@ class CoulombMatrixEig(CoulombMatrix):
  >>> featurizers = dc.feat.CoulombMatrixEig(max_atoms=23)
  >>> input_file = 'deepchem/feat/tests/data/water.sdf' # really backed by water.sdf.csv
  >>> tasks = ["atomization_energy"]
-  >>> featurizer = dc.data.SDFLoader(tasks, featurizer=featurizers)
-  >>> dataset = featurizer.featurize(input_file) #doctest: +ELLIPSIS
+  >>> loader = dc.data.SDFLoader(tasks, featurizer=featurizers)
+  >>> dataset = loader.create_dataset(input_file) #doctest: +ELLIPSIS
  Reading structures from deepchem/feat/tests/data/water.sdf.
  ...
  Loading dataset from disk.
  """

  conformers = True
+14 −15
@@ -6,11 +6,13 @@ import sklearn
import tempfile
import numpy as np
import shutil
-from deepchem.utils.save import log
+import logging
from deepchem.models import Model
from deepchem.data import DiskDataset
from deepchem.trans import undo_transforms

+logger = logging.getLogger(__name__)


class SingletaskToMultitask(Model):
  """
@@ -19,18 +21,17 @@ class SingletaskToMultitask(Model):
  Warning: This current implementation is only functional for sklearn models.
  """

-  def __init__(self, tasks, model_builder, model_dir=None, verbose=True):
-    super(SingletaskToMultitask, self).__init__(
-        self, model_dir=model_dir, verbose=verbose)
+  def __init__(self, tasks, model_builder, model_dir=None):
+    super(SingletaskToMultitask, self).__init__(self, model_dir=model_dir)
    self.tasks = tasks
    self.task_model_dirs = {}
    self.model_builder = model_builder
    log("About to initialize singletask to multitask model", self.verbose)
    logger.info("About to initialize singletask to multitask model")
    for task in self.tasks:
      task_model_dir = os.path.join(self.model_dir, str(task))
      if not os.path.exists(task_model_dir):
        os.makedirs(task_model_dir)
      log("Initializing directory for task %s" % task, self.verbose)
      logger.info("Initializing directory for task %s" % task)
      self.task_model_dirs[task] = task_model_dir

  def _create_task_datasets(self, dataset):
@@ -44,10 +45,8 @@ class SingletaskToMultitask(Model):
      task_data_dirs.append(task_data_dir)
    task_datasets = self._to_singletask(dataset, task_data_dirs)
    for task, task_dataset in zip(self.tasks, task_datasets):
-      log(
-          "Dataset for task %s has shape %s" % (task,
-                                                str(task_dataset.get_shape())),
-          self.verbose)
+      logger.info("Dataset for task %s has shape %s" %
+                  (task, str(task_dataset.get_shape())))
    return task_datasets

  @staticmethod
@@ -55,17 +54,17 @@ class SingletaskToMultitask(Model):
    """Transforms a multitask dataset to a collection of singletask datasets."""
    tasks = dataset.get_task_names()
    assert len(tasks) == len(task_dirs)
    log("Splitting multitask dataset into singletask datasets", dataset.verbose)
    logger.info("Splitting multitask dataset into singletask datasets")
    task_datasets = [
        DiskDataset.create_dataset([], task_dirs[task_num], [task])
        for (task_num, task) in enumerate(tasks)
    ]
    #task_metadata_rows = {task: [] for task in tasks}
    for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
      log("Processing shard %d" % shard_num, dataset.verbose)
      logger.info("Processing shard %d" % shard_num)
      basename = "dataset-%d" % shard_num
      for task_num, task in enumerate(tasks):
        log("\tTask %s" % task, dataset.verbose)
        logger.info("\tTask %s" % task)
        if len(w.shape) == 1:
          w_task = w
        elif w.shape[1] == 1:
@@ -94,10 +93,10 @@ class SingletaskToMultitask(Model):
    """
    if not isinstance(dataset, DiskDataset):
      raise ValueError('SingletaskToMultitask only works with DiskDatasets')
    log("About to create task-specific datasets", self.verbose)
    logger.info("About to create task-specific datasets")
    task_datasets = self._create_task_datasets(dataset)
    for ind, task in enumerate(self.tasks):
      log("Fitting model for task %s" % task, self.verbose)
      logger.info("Fitting model for task %s" % task)
      task_model = self.model_builder(self.task_model_dirs[task])
      task_model.fit(task_datasets[ind], **kwargs)
      task_model.save()
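
With the switch from deepchem.utils.save.log to the standard logging module, these messages are emitted through logging.getLogger(__name__), so they sit under the 'deepchem' logger hierarchy and are hidden by default. A minimal way to surface them from calling code:

import logging

# The root logger defaults to WARNING; enable INFO output to see the
# messages that previously went through log(..., verbose).
logging.basicConfig(level=logging.INFO)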
+1 −1
@@ -430,7 +430,7 @@ class SingletaskStratifiedSplitter(Splitter):
    >>> X = np.random.rand(n_samples, n_features)
    >>> y = np.random.rand(n_samples, n_tasks)
    >>> w = np.ones_like(y)
-    >>> dataset = DiskDataset.from_numpy(np.ones((100,n_tasks)), np.ones((100,n_tasks)), verbose=False)
+    >>> dataset = DiskDataset.from_numpy(np.ones((100,n_tasks)), np.ones((100,n_tasks)))
    >>> splitter = SingletaskStratifiedSplitter(task_number=5, verbose=False)
    >>> train_dataset, test_dataset = splitter.train_test_split(dataset)
