Commit 74fb2716 authored by Bharath Ramsundar

fixes

parent 19029da8
+7 −7
@@ -423,7 +423,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_to_numpy(self):
    """Test that transformation to numpy arrays is sensible."""
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    data_shape = solubility_dataset.get_data_shape()
    tasks = solubility_dataset.get_task_names()
    X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
@@ -438,7 +438,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_consistent_ordering(self):
    """Test that ordering of labels is consistent over time."""
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()

    ids1 = solubility_dataset.ids
    ids2 = solubility_dataset.ids
@@ -447,7 +447,7 @@ class TestDatasets(test_util.TensorFlowTestCase):

  def test_get_statistics(self):
    """Test statistics computation of this dataset."""
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
    X_means, y_means = np.mean(X, axis=0), np.mean(y, axis=0)
@@ -460,7 +460,7 @@ class TestDatasets(test_util.TensorFlowTestCase):
    np.testing.assert_allclose(comp_y_stds, y_stds)

  def test_disk_iterate_batch_size(self):
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
    batch_sizes = []
@@ -685,7 +685,7 @@ class TestDatasets(test_util.TensorFlowTestCase):
          np.sort(all_ids, axis=0), np.sort(test_ids, axis=0))

  def test_numpy_iterate_batch_size(self):
-    solubility_dataset = dc.data.tests.load_solubility_data()
+    solubility_dataset = load_solubility_data()
    X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
    solubility_dataset = dc.data.NumpyDataset.from_DiskDataset(
@@ -798,12 +798,12 @@ class TestDatasets(test_util.TensorFlowTestCase):
  @unittest.skipIf(PYTORCH_IMPORT_FAILED, 'PyTorch is not installed')
  def test_make_pytorch_dataset_from_disk(self):
    """Test creating a PyTorch Dataset from a DiskDataset."""
-    dataset = dc.data.tests.load_solubility_data()
+    dataset = load_solubility_data()
    self._validate_pytorch_dataset(dataset)

  def test_dataframe(self):
    """Test converting between Datasets and DataFrames."""
-    dataset = dc.data.tests.load_solubility_data()
+    dataset = load_solubility_data()

    # A round trip from Dataset to DataFrame to Dataset should produce identical arrays.

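The hunks above swap the fully qualified dc.data.tests.load_solubility_data() for a bare load_solubility_data(), which implies a direct import near the top of the test module; the import itself falls outside these hunks. A minimal sketch of the assumed pattern, with the import path inferred from the old dotted name:

# Assumed import, inferred from the old qualified call; the real import
# line is not visible in this diff.
from deepchem.data.tests import load_solubility_data

def test_to_numpy_sketch():
  """Illustrative only: mirrors the access pattern used in test_to_numpy."""
  solubility_dataset = load_solubility_data()
  X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
  assert len(X) == len(y) == len(w) == len(ids)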
+58 −74
@@ -13,12 +13,15 @@ import pandas as pd
import itertools
import os
import deepchem as dc
+import logging
from deepchem.data import DiskDataset
from deepchem.utils import ScaffoldGenerator
from deepchem.utils.save import log
from deepchem.data import NumpyDataset
from deepchem.utils.save import load_data

+logger = logging.getLogger(__name__)


def generate_scaffold(smiles, include_chirality=False):
  """Compute the Bemis-Murcko scaffold for a SMILES string."""
@@ -44,10 +47,6 @@ class Splitter(object):
    Abstract base class for chemically aware splits.
    """

-  def __init__(self, verbose=False):
-    """Creates splitter object."""
-    self.verbose = verbose

  def k_fold_split(self, dataset, k, directories=None, **kwargs):
    """
    Parameters
@@ -75,7 +74,7 @@ class Splitter(object):
    :param kwargs:
    :return: list of length k tuples of (train, cv)
    """
-    log("Computing K-fold split", self.verbose)
+    logger.info("Computing K-fold split")
    if directories is None:
      directories = [tempfile.mkdtemp() for _ in range(2 * k)]
    else:
@@ -125,14 +124,12 @@ class Splitter(object):
                             frac_test=.1,
                             seed=None,
                             log_every_n=1000,
-                             verbose=True,
                             **kwargs):
-    """
-        Splits self into train/validation/test sets.
+    """ Splits self into train/validation/test sets.

    Returns Dataset objects.
    """
-    log("Computing train/valid/test indices", self.verbose)
+    logger.info("Computing train/valid/test indices")
    train_inds, valid_inds, test_inds = self.split(
        dataset,
        seed=seed,
@@ -164,10 +161,8 @@ class Splitter(object):
                       test_dir=None,
                       seed=None,
                       frac_train=.8,
-                       verbose=True,
                       **kwargs):
-    """
-        Splits self into train/test sets.
+    """Splits self into train/test sets.
    Returns Dataset objects.
    """
    valid_dir = tempfile.mkdtemp()
@@ -180,7 +175,6 @@ class Splitter(object):
        frac_test=1 - frac_train,
        frac_valid=0.,
        seed=seed,
-        verbose=verbose,
        **kwargs)
    return train_dataset, test_dataset

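With the verbose flags being stripped throughout this file, callers opt into these progress messages through the standard logging machinery rather than a constructor or method argument. A minimal sketch, assuming the splitters live in a module importable as deepchem.splits.splitters (the file path is not shown in this view):

import logging

# Turn on INFO-level output process-wide...
logging.basicConfig(level=logging.INFO)

# ...or only for the splitter module. The logger name is an assumption
# based on logging.getLogger(__name__) and the usual package layout.
logging.getLogger("deepchem.splits.splitters").setLevel(logging.INFO)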
@@ -191,7 +185,6 @@ class Splitter(object):
            frac_valid=None,
            frac_test=None,
            log_every_n=None,
-            verbose=False,
            **kwargs):
    """
    Stub to be filled in by child classes.
@@ -400,7 +393,7 @@ class RandomStratifiedSplitter(Splitter):

  def k_fold_split(self, dataset, k, directories=None, **kwargs):
    """Needs custom implementation due to ragged splits for stratification."""
-    log("Computing K-fold split", self.verbose)
+    logger.info("Computing K-fold split")
    if directories is None:
      directories = [tempfile.mkdtemp() for _ in range(k)]
    else:
@@ -433,12 +426,12 @@ class SingletaskStratifiedSplitter(Splitter):
    >>> y = np.random.rand(n_samples, n_tasks)
    >>> w = np.ones_like(y)
    >>> dataset = DiskDataset.from_numpy(np.ones((100,n_tasks)), np.ones((100,n_tasks)))
-    >>> splitter = SingletaskStratifiedSplitter(task_number=5, verbose=False)
+    >>> splitter = SingletaskStratifiedSplitter(task_number=5)
    >>> train_dataset, test_dataset = splitter.train_test_split(dataset)

    """

-  def __init__(self, task_number=0, verbose=False):
+  def __init__(self, task_number=0):
    """
    Creates splitter object.

@@ -446,11 +439,8 @@ class SingletaskStratifiedSplitter(Splitter):
    ----------
    task_number: int (Optional, Default 0)
      Task number for stratification.
-        verbose: bool (Optional, Default False)
-          Controls logging frequency.
    """
    self.task_number = task_number
-    self.verbose = verbose

  def k_fold_split(self,
                   dataset,
@@ -479,7 +469,7 @@ class SingletaskStratifiedSplitter(Splitter):
    fold_datasets: List
      List containing dc.data.Dataset objects
    """
-    log("Computing K-fold split", self.verbose)
+    logger.info("Computing K-fold split")
    if directories is None:
      directories = [tempfile.mkdtemp() for _ in range(k)]
    else:
@@ -731,7 +721,7 @@ class IndiceSplitter(Splitter):
    Class for splits based on input order.
    """

-  def __init__(self, verbose=False, valid_indices=None, test_indices=None):
+  def __init__(self, valid_indices=None, test_indices=None):
    """
    Parameters
    -----------
@@ -740,7 +730,6 @@ class IndiceSplitter(Splitter):
    test_indices: list of int
        indices of samples in the test set
    """
-    self.verbose = verbose
    self.valid_indices = valid_indices
    self.test_indices = test_indices

@@ -866,7 +855,7 @@ class ScaffoldSplitter(Splitter):
    valid_cutoff = (frac_train + frac_valid) * len(dataset)
    train_inds, valid_inds, test_inds = [], [], []

-    log("About to sort in scaffold sets", self.verbose)
+    logger.info("About to sort in scaffold sets")
    for scaffold_set in scaffold_sets:
      if len(train_inds) + len(scaffold_set) > train_cutoff:
        if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff:
@@ -884,10 +873,10 @@ class ScaffoldSplitter(Splitter):
    scaffolds = {}
    data_len = len(dataset)

-    log("About to generate scaffolds", self.verbose)
+    logger.info("About to generate scaffolds")
    for ind, smiles in enumerate(dataset.ids):
      if ind % log_every_n == 0:
-        log("Generating scaffold %d/%d" % (ind, data_len), self.verbose)
+        logger.info("Generating scaffold %d/%d" % (ind, data_len))
      scaffold = generate_scaffold(smiles)
      if scaffold not in scaffolds:
        scaffolds[scaffold] = [ind]
@@ -995,11 +984,10 @@ class SpecifiedSplitter(Splitter):
  Class that splits data according to user specification.
  """

-  def __init__(self, input_file, split_field, verbose=False):
+  def __init__(self, input_file, split_field):
    """Provide input information for splits."""
    raw_df = next(load_data([input_file], shard_size=None))
    self.splits = raw_df[split_field].values
-    self.verbose = verbose

  def split(self,
            dataset,
@@ -1030,13 +1018,11 @@ class SpecifiedIndexSplitter(Splitter):
  Class that splits data according to user index specification
  """

-  def __init__(self, train_inds, valid_inds, test_inds, verbose=False):
+  def __init__(self, train_inds, valid_inds, test_inds):
    """Provide input information for splits."""
    self.train_inds = train_inds
    self.valid_inds = valid_inds
    self.test_inds = test_inds
-    self.verbose = verbose
-    super(SpecifiedIndexSplitter, self).__init__(verbose)

  def split(self,
            dataset,
@@ -1044,8 +1030,7 @@ class SpecifiedIndexSplitter(Splitter):
            frac_train=.8,
            frac_valid=.1,
            frac_test=.1,
-            log_every_n=1000,
-            verbose=False):
+            log_every_n=1000):
    """
    Splits internal compounds into train/validation/test by user-specification.
    """
@@ -1054,10 +1039,9 @@ class SpecifiedIndexSplitter(Splitter):

class TimeSplitterPDBbind(Splitter):

-  def __init__(self, ids, year_file=None, verbose=False):
+  def __init__(self, ids, year_file=None):
    self.ids = ids
    self.year_file = year_file
-    self.verbose = verbose

  def split(self,
            dataset,
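One design note on the new calls: the hunks above build each message eagerly with the % operator (e.g. "Generating scaffold %d/%d" % (ind, data_len)). The logging module also accepts the arguments separately and interpolates lazily, only when a record is actually emitted, and getLogger(__name__) gives every module a dotted, hierarchical name, so configuring a parent logger covers all modules beneath it. A small sketch of both points, with the parent name "deepchem" assumed from the package:

import logging

logger = logging.getLogger(__name__)

# Eager: the string is built even when INFO is filtered out.
logger.info("Generating scaffold %d/%d" % (3, 100))

# Lazy: formatting is deferred until a handler actually emits the record.
logger.info("Generating scaffold %d/%d", 3, 100)

# Hierarchical names mean one line can configure the whole library,
# assuming its modules all use logging.getLogger(__name__).
logging.getLogger("deepchem").setLevel(logging.INFO)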
+1 −0
@@ -5,6 +5,7 @@ __author__ = "Bharath Ramsundar, Aneesh Pappu"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

+import os
import tempfile
import unittest
import numpy as np
+3 −1
@@ -3,7 +3,7 @@
Contains an abstract base class that supports data transformations.
"""
import os

+import logging
import numpy as np
import scipy
import scipy.ndimage
@@ -12,6 +12,8 @@ import deepchem as dc
import tensorflow as tf
from deepchem.data import NumpyDataset

+logger = logging.getLogger(__name__)


def undo_transforms(y, transformers):
  """Undoes all transformations applied."""