Commit 3a2687cf authored by Bharath Ramsundar, committed by GitHub

Merge pull request #238 from rbharath/k_fold_splits

K-fold Random Stratified Splits
parents 431b143b 997c598e
+0 −3
@@ -107,8 +107,6 @@ def pad_batch(batch_size, X_b, y_b, w_b, ids_b):
      start += increment
    return (X_out, y_out, w_out, ids_out)

      

class Dataset(object):
  """
  Wrapper class for dataset transformed into X, y, w numpy ndarrays.
@@ -625,7 +623,6 @@ class Dataset(object):
    metadata_rows = []
    tasks = self.get_task_names()
    for shard_num, (X, y, w, ids) in enumerate(self.itershards()):
-      log("Selecting from shard %d" % shard_num, self.verbosity)
      shard_len = len(X)
      # Find indices which rest in this shard
      num_shard_elts = 0
+83 −60
@@ -28,9 +28,10 @@ def randomize_arrays(array_list):
  # assumes that every array is of the same dimension
  num_rows = array_list[0].shape[0]
  perm = np.random.permutation(num_rows)
+  permuted_arrays = []
  for array in array_list:
-    array = array[perm]
-  return array_list
+    permuted_arrays.append(array[perm])
+  return permuted_arrays
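
A rough standalone sketch of the idea behind this fix, outside the commit itself: one shared permutation keeps X, y, w, and ids row-aligned when the arrays are shuffled together (plain numpy, hypothetical helper name, not the deepchem function):

import numpy as np

def shuffle_together(arrays):
  # A single permutation applied to every array keeps rows paired across arrays.
  perm = np.random.permutation(arrays[0].shape[0])
  return [array[perm] for array in arrays]

X = np.arange(12).reshape(4, 3)
y = np.array([0, 1, 0, 1])
X_shuf, y_shuf = shuffle_together([X, y])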

class Splitter(object):
  """
@@ -103,13 +104,13 @@ class Splitter(object):
    Splits self into train/test sets.
    Returns Dataset objects.
    """
-    valid_dir = None
+    valid_dir = tempfile.mkdtemp()
    train_samples, _, test_samples = self.train_valid_test_split(
      samples, train_dir, valid_dir, test_dir,
      frac_train=frac_train, frac_test=1-frac_train, frac_valid=0.)
    return train_samples, test_samples

-  def split(self, samples, frac_train=None, frac_valid=None, frac_test=None,
+  def split(self, dataset, frac_train=None, frac_valid=None, frac_test=None,
            log_every_n=None):
    """
    Stub to be filled in by child classes.
@@ -117,10 +118,9 @@ class Splitter(object):
    raise NotImplementedError

  

-class StratifiedSplitter(Splitter):
+class RandomStratifiedSplitter(Splitter):
  """
-  Stratified Splitter class.
+  RandomStratified Splitter class.

  For sparse multitask datasets, a standard split offers no guarantees that the
  splits will have any active compounds. This class guarantees that each task
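
To see why this guarantee matters, consider a sparse multitask label matrix where each task has only a handful of actives; a naive random 80/20 split can easily leave some task with no actives in the smaller piece. A small illustrative numpy check, not part of this commit:

import numpy as np

np.random.seed(0)
n_samples, n_tasks = 100, 10
# Sparse multitask labels: roughly 3 actives per task on average.
y = np.random.binomial(1, .03, size=(n_samples, n_tasks))

perm = np.random.permutation(n_samples)
test = perm[int(.8 * n_samples):]  # test chunk of a naive 80/20 random split
actives_in_test = y[test].sum(axis=0)
# With so few actives per task, entries of actives_in_test are frequently
# zero, i.e. some task ends up with no actives at all in the test portion.
print(actives_in_test)
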
@@ -143,52 +143,63 @@ class StratifiedSplitter(Splitter):
      col_hits = int(frac_split * col_hits)
    return required_hits

-  def __generate_required_index(self, w, required_hit_list):
-    col_index = 0
-    index_hits = []
+  def get_task_split_indices(self, y, w, frac_split):
+    """Returns num datapoints needed per task to split properly."""
+    w_present = (w != 0)
+    y_present = y * w_present

+    # Compute number of actives needed per task.
+    task_actives = np.sum(y_present, axis=0)
+    task_split_actives = (frac_split*task_actives).astype(int)

-    # loop through each column and obtain index required to splice out for
-    # required fraction of hits
-    for col in w.T:
-      num_hit = 0
-      num_required = required_hit_list[col_index]
-      for index, value in enumerate(col):
-        if value != 0:
-          num_hit += 1
-          if num_hit >= num_required:
-            index_hits.append(index)
-            break
-      col_index += 1
-    return index_hits

-  def __split(self, X, y, w, ids, frac_split):
+    split_indices = []
+    n_tasks = np.shape(y)[1]
+    for task in range(n_tasks):
+      actives_count = task_split_actives[task]
+      cum_task_actives = np.cumsum(y_present[:, task])
+      # Find the first index where the cumulative number of actives equals
+      # the actives_count
+      split_index = np.amin(np.where(cum_task_actives >= actives_count)[0])
+      # Note that np.where tells us last index required to exceed
+      # actives_count, so we actually want the following location
+      split_indices.append(split_index+1)
+    return split_indices

+  # TODO(rbharath): Refactor this split method to match API of other splits (or
+  # potentially refactor those to match this.)
+  def split(self, dataset, split_dirs, frac_split):
    """
    Method that does bulk of splitting dataset.
    """
-    # find the total number of hits for each task and calculate the required
-    # number of hits for split based on frac_split
-    required_hits_list = self.__generate_required_hits(w, frac_split)
-    # finds index cutoff per task in array to get required split calculated
-    index_list = self.__generate_required_index(w, required_hits_list)

-    w_1 = w_2 = np.zeros(w.shape)
+    assert len(split_dirs) == 2
+    # Handle edge case where frac_split is 1
+    if frac_split == 1:
+      X, y, w, ids = dataset.to_numpy()
+      dataset_1 = Dataset.from_numpy(split_dirs[0], X, y, w, ids)
+      dataset_2 = None
+      return dataset_1, dataset_2
+    X, y, w, ids = randomize_arrays(dataset.to_numpy())
+    split_indices = self.get_task_split_indices(y, w, frac_split)

-    # chunk appropriate values into weights matrices
-    for col_index, index in enumerate(index_list):
+    # Create weight matrices for the two halves.
+    w_1, w_2 = np.zeros_like(w), np.zeros_like(w)
+    for task, split_index in enumerate(split_indices):
      # copy over up to required index for weight first_split
-      w_1[:index, col_index] = w[:index, col_index]
-      w_2[index:, col_index] = w[index:, col_index]
+      w_1[:split_index, task] = w[:split_index, task]
+      w_2[split_index:, task] = w[split_index:, task]

    # check whether any rows in either w_1 or w_2 are all zeros
    rows_1 = w_1.any(axis=1)
-    rows_2 = w_2.any(axis=1)
+    X_1, y_1, w_1, ids_1 = X[rows_1], y[rows_1], w_1[rows_1], ids[rows_1]
+    dataset_1 = Dataset.from_numpy(split_dirs[0], X_1, y_1, w_1, ids_1)

-    # prune first set
-    w_1, X_1, y_1, ids_1 = w_1[rows_1], X[rows_1], y[rows_1], ids[rows_1]

-    # prune second sets
-    w_2, X_2, y_2, ids_2 = w_2[rows_2], X[rows_2], y[rows_2], ids[rows_2]
+    rows_2 = w_2.any(axis=1)
+    X_2, y_2, w_2, ids_2 = X[rows_2], y[rows_2], w_2[rows_2], ids[rows_2]
+    dataset_2 = Dataset.from_numpy(split_dirs[1], X_2, y_2, w_2, ids_2)

-    return ((X_1, y_1, w_1, ids_1), (X_2, y_2, w_2, ids_2))
+    return dataset_1, dataset_2
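
Read together, the new code splits per task: get_task_split_indices walks the cumulative count of weighted actives down the rows and records the first row index by which frac_split of that task's actives have appeared, and split then zeroes the weights on the opposite side of that index, dropping rows whose weights are all zero from each half. A toy single-task walk-through of the index computation under the same conventions (y is n_samples x n_tasks, w masks missing labels; illustrative only):

import numpy as np

y = np.array([[1], [0], [1], [0], [1], [1]])  # one task with 4 actives
w = np.ones_like(y)                           # no missing labels
frac_split = .5

y_present = y * (w != 0)
actives_needed = int(frac_split * y_present[:, 0].sum())  # 2 actives
cum_actives = np.cumsum(y_present[:, 0])                  # [1 1 2 2 3 4]
split_index = np.amin(np.where(cum_actives >= actives_needed)[0]) + 1
# split_index == 3: rows [:3] already hold 2 of the 4 actives, so the first
# half keeps weights for rows [:3] and the second half for rows [3:].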

  def train_valid_test_split(self, dataset, train_dir,
                             valid_dir, test_dir, frac_train=.8,
@@ -196,29 +207,41 @@ class StratifiedSplitter(Splitter):
                             log_every_n=1000):
    """Custom split due to raggedness in original split.
    """

-    # Obtain original x, y, and w arrays and shuffle
-    X, y, w, ids = randomize_arrays(dataset.to_numpy())
-    train_arrays, rem_arrays = self.__split(X, y, w, ids, frac_train)
-    (X_train, y_train, w_train, ids_train) = train_arrays
-    (X_rem, y_rem, w_rem, ids_rem) = rem_arrays
+    rem_dir = tempfile.mkdtemp()
+    train_dataset, rem_dataset = self.split(
+        dataset, [train_dir, rem_dir], frac_train)

    # calculate percent split for valid (out of test and valid)
    if frac_valid + frac_test > 0:
      valid_percentage = frac_valid / (frac_valid + frac_test)
+    else:
+      return train_dataset, None, None
    # split test data into valid and test, treating sub test set also as sparse
-    valid_arrays, test_arrays = self.__split(
-        X_rem, y_rem, w_rem, ids_rem, valid_percentage)
-    (X_valid, y_valid, w_valid, ids_valid) = valid_arrays
-    (X_test, y_test, w_test, ids_test) = test_arrays

-    # turn back into dataset objects
-    train_data = Dataset.from_numpy(
-        train_dir, X_train, y_train, w_train, ids_train)
-    valid_data = Dataset.from_numpy(
-        valid_dir, X_valid, y_valid, w_valid, ids_valid)
-    test_data = Dataset.from_numpy(
-        test_dir, X_test, y_test, w_test, ids_test)
-    return train_data, valid_data, test_data
+    valid_dataset, test_dataset = self.split(
+        rem_dataset, [valid_dir, test_dir], valid_percentage)

+    return train_dataset, valid_dataset, test_dataset

+  def k_fold_split(self, dataset, directories, compute_feature_statistics=True):
+    """Needs custom implementation due to ragged splits for stratification."""
+    log("Computing K-fold split", self.verbosity)
+    k = len(directories)
+    fold_datasets = []
+    # rem_dataset is remaining portion of dataset
+    rem_dataset = dataset
+    for fold in range(k):
+      # Note frac_fold starts as 1/k since fold starts at 0, and ends at 1
+      # since fold goes up to k-1.
+      frac_fold = 1./(k-fold)
+      fold_dir = directories[fold]
+      rem_dir = tempfile.mkdtemp()
+      fold_dataset, rem_dataset = self.split(
+          rem_dataset, [fold_dir, rem_dir], frac_split=frac_fold)
+      fold_datasets.append(fold_dataset)
+    return fold_datasets



class MolecularWeightSplitter(Splitter):
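
Both train_valid_test_split and k_fold_split above reuse the two-way split by repeatedly re-splitting a remainder, so each fraction is relative to what is left rather than to the original dataset: with frac_train=.8, frac_valid=.1, frac_test=.1, the second split uses valid_percentage = .1/(.1+.1) = .5 of the remaining 20%, and the k-fold loop uses frac_fold = 1/(k - fold). A rough arithmetic sketch of the resulting fold sizes (standalone; round() is used only to sidestep float truncation):

k, n = 5, 100
remaining = n
fold_sizes = []
for fold in range(k):
  frac_fold = 1. / (k - fold)  # 1/5, 1/4, 1/3, 1/2, 1
  fold_size = int(round(frac_fold * remaining))
  fold_sizes.append(fold_size)
  remaining -= fold_size
print(fold_sizes)  # [20, 20, 20, 20, 20]
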
+191 −3
@@ -15,7 +15,7 @@ from deepchem.datasets import Dataset
from deepchem.splits import RandomSplitter
from deepchem.splits import IndexSplitter
from deepchem.splits import ScaffoldSplitter
-from deepchem.splits import StratifiedSplitter
+from deepchem.splits import RandomStratifiedSplitter
from deepchem.datasets.tests import TestDatasetAPI


@@ -180,6 +180,194 @@ class TestSplitters(TestDatasetAPI):
    assert sorted(merged_dataset.get_ids()) == (
           sorted(solubility_dataset.get_ids()))

  def test_singletask_stratified_column_indices(self):
    """
    Test RandomStratifiedSplitter's split-index computation on a simple singletask dataset.
    """
    # Test singletask case. 
    n_samples = 100
    n_positives = 20
    n_features = 10
    n_tasks = 1

    X = np.random.rand(n_samples, n_features)
    y = np.zeros((n_samples, n_tasks))
    y[:n_positives] = 1
    w = np.ones((n_samples, n_tasks))
    ids = np.arange(n_samples)
    stratified_splitter = RandomStratifiedSplitter()
    column_indices = stratified_splitter.get_task_split_indices(
        y, w, frac_split=.5)

    split_index = column_indices[0]
    # The split index should partition dataset in half.
    assert np.count_nonzero(y[:split_index]) == 10

  def test_singletask_stratified_column_indices_mask(self):
    """
    Test RandomStratifiedSplitter's split-index computation on a dataset with a weight mask.
    """
    # Test singletask case. 
    n_samples = 100
    n_positives = 20
    n_features = 10
    n_tasks = 1

    # Test case where some weights are zero (i.e. masked)
    X = np.random.rand(n_samples, n_features)
    y = np.zeros((n_samples, n_tasks))
    y[:n_positives] = 1
    w = np.ones((n_samples, n_tasks))
    # Set half the positives to have zero weight
    w[:n_positives//2] = 0
    ids = np.arange(n_samples)

    stratified_splitter = RandomStratifiedSplitter()
    column_indices = stratified_splitter.get_task_split_indices(
        y, w, frac_split=.5)

    split_index = column_indices[0]
    # There are 10 nonzero actives.
    # The split index should partition this into half, so expect 5
    w_present = (w != 0)
    y_present = y * w_present
    assert np.count_nonzero(y_present[:split_index]) == 5

  def test_multitask_stratified_column_indices(self):
    """
    Test RandomStratifiedSplitter split on multitask dataset.
    """
    n_samples = 100
    n_features = 10
    n_tasks = 10
    X = np.random.rand(n_samples, n_features)
    p = .05 # proportion actives
    y = np.random.binomial(1, p, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))

    stratified_splitter = RandomStratifiedSplitter()
    split_indices = stratified_splitter.get_task_split_indices(
        y, w, frac_split=.5)

    for task in range(n_tasks):
      split_index = split_indices[task]
      task_actives = np.count_nonzero(y[:, task])
      # The split index should partition dataset in half.
      assert np.count_nonzero(y[:split_index, task]) == int(task_actives/2)

  def test_multitask_stratified_column_indices_masked(self):
    """
    Test RandomStratifiedSplitter split on multitask dataset.
    """
    n_samples = 200
    n_features = 10
    n_tasks = 10
    X = np.random.rand(n_samples, n_features)
    p = .05 # proportion actives
    y = np.random.binomial(1, p, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    # Mask half the examples
    w[:n_samples//2] = 0

    stratified_splitter = RandomStratifiedSplitter()
    split_indices = stratified_splitter.get_task_split_indices(
        y, w, frac_split=.5)

    w_present = (w != 0)
    y_present = y * w_present
    for task in range(n_tasks):
      split_index = split_indices[task]
      task_actives = np.count_nonzero(y_present[:, task])
      # The split index should partition dataset in half.
      assert np.count_nonzero(y_present[:split_index, task]) == int(task_actives/2)

  def test_singletask_stratified_split(self):
    """
    Test RandomStratifiedSplitter on a singletask split.
    """
    np.random.seed(2314)
    # Test singletask case. 
    n_samples = 20
    n_positives = 10
    n_features = 10
    n_tasks = 1

    X = np.random.rand(n_samples, n_features)
    y = np.zeros((n_samples, n_tasks))
    y[:n_positives] = 1
    w = np.ones((n_samples, n_tasks))
    ids = np.arange(n_samples)
    data_dir = tempfile.mkdtemp()
    dataset = Dataset.from_numpy(data_dir, X, y, w, ids)

    stratified_splitter = RandomStratifiedSplitter()
    split_dirs = [tempfile.mkdtemp(), tempfile.mkdtemp()]
    dataset_1, dataset_2 = stratified_splitter.split(
        dataset, split_dirs, frac_split=.5)
  
    # Should have split cleanly in half (picked random seed to ensure this)
    assert len(dataset_1) == 10
    assert len(dataset_2) == 10

    # Check positives are correctly distributed
    y_1 = dataset_1.get_labels()
    assert np.count_nonzero(y_1) == n_positives/2

    y_2 = dataset_2.get_labels()
    assert np.count_nonzero(y_2) == n_positives/2

  def test_singletask_stratified_k_fold_split(self):
    """
    Test RandomStratifiedSplitter k-fold class.
    """
    n_samples = 100
    n_positives = 20
    n_features = 10
    n_tasks = 1

    X = np.random.rand(n_samples, n_features)
    y = np.zeros(n_samples)
    y[:n_positives] = 1
    w = np.ones(n_samples)
    ids = np.arange(n_samples)

    data_dir = tempfile.mkdtemp()
    dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
    
    stratified_splitter = RandomStratifiedSplitter()
    ids_set = set(dataset.get_ids())

    K = 5
    fold_dirs = [tempfile.mkdtemp() for i in range(K)]
    fold_datasets = stratified_splitter.k_fold_split(
        dataset, fold_dirs)

    for fold in range(K):
      fold_dataset = fold_datasets[fold]
      # Verify lengths is 100/k == 20
      # Note: This wouldn't work for multitask str
      # assert len(fold_dataset) == n_samples/K
      fold_labels = fold_dataset.get_labels()
      # Verify that each fold has n_positives/K = 4 positive examples.
      assert np.count_nonzero(fold_labels == 1) == n_positives/K
      # Verify that compounds in this fold are subset of original compounds
      fold_ids_set = set(fold_dataset.get_ids())
      assert fold_ids_set.issubset(ids_set)
      # Verify that no two folds have overlapping compounds.
      for other_fold in range(K):
        if fold == other_fold:
          continue
        other_fold_dataset = fold_datasets[other_fold]
        other_fold_ids_set = set(other_fold_dataset.get_ids())
        assert fold_ids_set.isdisjoint(other_fold_ids_set)

    merge_dir = tempfile.mkdtemp()
    merged_dataset = Dataset.merge(merge_dir, fold_datasets)
    assert len(merged_dataset) == len(dataset)
    assert sorted(merged_dataset.get_ids()) == (
           sorted(dataset.get_ids()))


  def test_multitask_random_split(self):
    """
    Test multitask RandomSplitter class.
@@ -227,7 +415,7 @@ class TestSplitters(TestDatasetAPI):

  def test_stratified_multitask_split(self):
    """
-    Test multitask StratifiedSplitter class
+    Test multitask RandomStratifiedSplitter class
    """
    # sparsity is determined by number of w weights that are 0 for a given
    # task structure of w np array is such that each row corresponds to a
@@ -235,7 +423,7 @@ class TestSplitters(TestDatasetAPI):
    sparse_dataset = self.load_sparse_multitask_dataset()
    X, y, w, ids = sparse_dataset.to_numpy()
    
-    stratified_splitter = StratifiedSplitter()
+    stratified_splitter = RandomStratifiedSplitter()
    datasets = stratified_splitter.train_valid_test_split(
        sparse_dataset,
        self.train_dir, self.valid_dir, self.test_dir,