Commit 0e57beeb authored by Bharath Ramsundar
Browse files

Added select feature to dataset.

parent e5916873
Loading
Loading
Loading
Loading
+16 −10
Original line number Diff line number Diff line
@@ -18,10 +18,6 @@ __author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"

# TODO(rbharath): The semantics of this class are very difficult to debug.
# Multiple transformations of the data are performed on disk, and computations
# of mean/std are spread across multiple functions for efficiency. Some
# refactoring needs to happen here.
class Dataset(object):
  """
  Wrapper class for dataset transformed into X, y, w numpy ndarrays.
@@ -140,7 +136,6 @@ class Dataset(object):
            out_X_sums, out_X_sum_squares, out_X_n,
            out_y_sums, out_y_sum_squares, out_y_n]


  def save_to_disk(self):
    """Save dataset to disk."""
    save_to_disk(
@@ -270,7 +265,6 @@ class Dataset(object):
                   metadata_rows=metadata_rows,
                   verbosity=self.verbosity)


  def shuffle(self, iterations=1):
    """Shuffles this dataset on disk to have random order."""
    #np.random.seed(9452)
@@ -332,11 +326,23 @@ class Dataset(object):
  def select(self, select_dir, indices):
    """Creates a new dataset from a selection of indices from self.

    The selection is done one shard at a time, so the full X, y, w, ids
    arrays are never loaded into memory at once.

    Parameters
    ----------
    select_dir: directory the selected shards are written to.
    indices: sequence of non-negative integer row positions, counted over
      the whole dataset (0 is the first row of the first shard).

    Returns
    -------
    A Dataset rooted at select_dir. Rows are grouped by the shard they came
    from; inside one shard they follow the order given in indices. For
    sorted indices this matches X[indices] taken on the full arrays.

    Raises
    ------
    IndexError if some index is negative or past the last row. The shards
    written before the problem was detected are left in select_dir.
    """
    indices = np.array(indices).astype(int)
    tasks = self.get_task_names()
    metadata_rows = []
    shard_start = 0
    num_selected = 0
    for shard_num, (X, y, w, ids) in enumerate(self.itershards()):
      shard_end = shard_start + len(X)
      # indices are global row positions: keep the ones that land in this
      # shard and shift them so they address the shard-local arrays.
      in_shard = (indices >= shard_start) & (indices < shard_end)
      local_inds = indices[in_shard] - shard_start
      shard_start = shard_end
      if len(local_inds) == 0:
        # NOTE(review): shards that contribute no rows are skipped rather
        # than written as zero-row shards -- confirm downstream code does
        # not rely on one output shard per input shard.
        continue
      num_selected += len(local_inds)
      basename = "dataset-%d" % shard_num
      metadata_rows.append(
          Dataset.write_data_to_disk(select_dir, basename, tasks,
                                     X[local_inds], y[local_inds],
                                     w[local_inds], ids[local_inds]))
    if num_selected != len(indices):
      # Every valid index matches exactly one shard, so a shortfall means
      # some index was negative or beyond the end of the dataset.
      raise IndexError(
          "select received %d indices outside the range [0, %d)"
          % (len(indices) - num_selected, shard_start))
    return Dataset(data_dir=select_dir,
                   metadata_rows=metadata_rows,
                   verbosity=self.verbosity)
    
  def to_numpy(self):
    """
+1 −0
Original line number Diff line number Diff line
@@ -76,6 +76,7 @@ def load_nci(base_dir, reload=True, force_transform=False):
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers
  transformers = []
  if regen or force_transform:
    print("About to transform data")
    transformers = [
+22 −0
Original line number Diff line number Diff line
@@ -48,6 +48,28 @@ class TestBasicDatasetAPI(TestDatasetAPI):
    solubility_dataset = self.load_solubility_data()
    assert len(solubility_dataset) == 10

  def test_select(self):
    """Test that dataset select works.

    Builds a small random dataset, selects a subset of rows by index, and
    checks that X, y, w and ids of the selected dataset match the same rows
    of the source arrays.
    """
    num_datapoints = 10
    num_features = 10
    num_tasks = 1
    X = np.random.rand(num_datapoints, num_features)
    y = np.random.randint(2, size=(num_datapoints, num_tasks))
    w = np.ones((num_datapoints, num_tasks))
    # Distinct ids, so the ids assertion can detect a wrong row selection
    # (identical ids would compare equal for any selection of this length).
    ids = np.array(["id%d" % i for i in range(num_datapoints)])
    dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids)

    select_dir = tempfile.mkdtemp()
    try:
      indices = [0, 4, 5, 8]
      select_dataset = dataset.select(select_dir, indices)
      X_sel, y_sel, w_sel, ids_sel = select_dataset.to_numpy()
      np.testing.assert_array_equal(X[indices], X_sel)
      np.testing.assert_array_equal(y[indices], y_sel)
      np.testing.assert_array_equal(w[indices], w_sel)
      np.testing.assert_array_equal(ids[indices], ids_sel)
    finally:
      # Remove the temporary directory even when an assertion fails.
      shutil.rmtree(select_dir)
    
  
  def test_iterbatches(self):
    """Test that iterating over batches of data works."""
    solubility_dataset = self.load_solubility_data()
+21 −4
Original line number Diff line number Diff line
@@ -47,6 +47,9 @@ class Splitter(object):
        dataset,
        frac_train=frac_train, frac_test=frac_test,
        frac_valid=frac_valid, log_every_n=log_every_n)
    ########################################################### DEBUG
    print("Computed indices successfully!")
    ########################################################### DEBUG
    train_dataset = dataset.select(train_dir, train_inds)
    if valid_dir is not None:
      valid_dataset = dataset.select(valid_dir, valid_inds)
@@ -117,9 +120,23 @@ class RandomSplitter(Splitter):
    """
    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
    np.random.seed(seed)
    train_cutoff = frac_train * len(dataset)
    valid_cutoff = (frac_train+frac_valid) * len(dataset)
    shuffled = np.random.permutation(range(len(dataset)))
    ########################################################### DEBUG
    print("About to compute len!")
    ########################################################### DEBUG
    num_datapoints = len(dataset)
    train_cutoff = int(frac_train * num_datapoints)
    ########################################################### DEBUG
    print("Successfully computed len!")
    ########################################################### DEBUG
    valid_cutoff = int((frac_train+frac_valid) * num_datapoints )
    ########################################################### DEBUG
    print("num_datapoints, train_cutoff, valid_cutoff")
    print(num_datapoints, train_cutoff, valid_cutoff)
    ########################################################### DEBUG
    shuffled = np.random.permutation(range(num_datapoints))
    ########################################################### DEBUG
    print("Successfully computed shuffled.")
    ########################################################### DEBUG
    return (shuffled[:train_cutoff], shuffled[train_cutoff:valid_cutoff],
            shuffled[valid_cutoff:])

@@ -137,7 +154,7 @@ class ScaffoldSplitter(Splitter):
    log("About to generate scaffolds", self.verbosity)
    data_len = len(dataset)
    for ind, smiles in enumerate(dataset.get_ids()):
      if self.verbosity is not None and ind % log_every_n == 0:
      if ind % log_every_n == 0:
        log("Generating scaffold %d/%d" % (ind, data_len), self.verbosity)
      scaffold = generate_scaffold(smiles)
      if scaffold not in scaffolds:
+13 −21
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ from deepchem.metrics import Metric
from deepchem.models.sklearn_models import SklearnModel
from deepchem.utils.evaluate import Evaluator
from deepchem.datasets.nci_datasets import load_nci
from deepchem.splits import RandomSplitter

np.random.seed(123)

@@ -29,7 +30,7 @@ force_transform = False
base_data_dir = "/scratch/users/rbharath/nci_data_dir"
base_dir = "/scratch/users/rbharath/nci_analysis_dir"

nci_tasks, dataset, transformers = load_nci(
nci_tasks, nci_dataset, transformers = load_nci(
    base_data_dir, reload=reload, force_transform=force_transform)

if os.path.exists(base_dir):
@@ -42,26 +43,17 @@ test_dir = os.path.join(base_dir, "test_dataset")
model_dir = os.path.join(base_dir, "model")

print("About to perform train/valid/test split.")
num_train = .8 * len(dataset)
X, y, w, ids = dataset.to_numpy()
num_tasks = 17
nci_tasks = nci_tasks[:num_tasks]
print("Using following tasks")
print(nci_tasks)
X_train, X_valid = X[:num_train], X[num_train:]
y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
ids_train, ids_valid = ids[:num_train], ids[num_train:]

if os.path.exists(train_dir):
  shutil.rmtree(train_dir)
train_dataset = Dataset.from_numpy(train_dir, X_train, y_train,
                                   w_train, ids_train, nci_tasks)

if os.path.exists(valid_dir):
  shutil.rmtree(valid_dir)
valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid,
                                   w_valid, ids_valid, nci_tasks)
splitter = RandomSplitter(verbosity=verbosity)
if (reload and
    os.path.exists(train_dir) and
    os.path.exists(valid_dir) and
    os.path.exists(test_dir)):
  train_dataset = Dataset(train_dir, reload=True)
  valid_dataset = Dataset(valid_dir, reload=True)
  test_dataset = Dataset(test_dir, reload=True)
else:
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      nci_dataset, train_dir, valid_dir, test_dir)

# Fit Logistic Regression models
nci_task_types = {task: "classification" for task in nci_tasks}