Commit 99896246 authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #204 from rbharath/grabbag

Better conversion of multitask to singletask datasets
parents e8eb8b82 6b55bb5e
Loading
Loading
Loading
Loading
+59 −4
Original line number Diff line number Diff line
@@ -34,8 +34,6 @@ class Dataset(object):
    self.verbosity = verbosity

    if not reload or not os.path.exists(self._get_metadata_filename()):
      log("About to start initializing dataset", self.verbosity)

      if metadata_rows is not None:
        self.metadata_df = Dataset.construct_metadata(metadata_rows)
        self.save_to_disk()
@@ -214,7 +212,7 @@ class Dataset(object):
        yield (X_batch, y_batch, w_batch, ids_batch)

  @staticmethod
  def from_numpy(data_dir, X, y, w=None, ids=None, tasks=None):
  def from_numpy(data_dir, X, y, w=None, ids=None, tasks=None, verbosity=None):
    n_samples = len(X)
    # The -1 indicates that y will be reshaped to have length -1
    if n_samples > 0:
@@ -229,7 +227,8 @@ class Dataset(object):
    if tasks is None:
      tasks = np.arange(n_tasks)
    raw_data = (ids, X, y, w)
    return Dataset(data_dir=data_dir, tasks=tasks, raw_data=raw_data)
    return Dataset(data_dir=data_dir, tasks=tasks, raw_data=raw_data,
                   verbosity=verbosity)

  @staticmethod
  def merge(merge_dir, datasets):
@@ -323,6 +322,10 @@ class Dataset(object):
        os.path.join(self.data_dir, row['ids'])), dtype=object)
    return (X, y, w, ids)

  def set_verbosity(self, new_verbosity):
    """Replaces the verbosity level stored on this dataset.

    The value is stored as-is on self.verbosity; no validation is performed.
    """
    self.verbosity = new_verbosity

  def select(self, select_dir, indices):
    """Creates a new dataset from a selection of indices from self."""
    if not os.path.exists(select_dir):
@@ -361,6 +364,38 @@ class Dataset(object):
                   metadata_rows=metadata_rows,
                   verbosity=self.verbosity)

  def to_singletask(self, task_dirs):
    """Splits this multitask dataset into one singletask dataset per task.

    For every shard and every task, only the rows whose weight for that task
    is nonzero are kept. Those rows are written to the directory found at the
    task's position in task_dirs, so task_dirs must hold exactly one entry
    per task.

    Returns a list of Dataset objects in the order of self.get_task_names().
    """
    tasks = self.get_task_names()
    assert len(tasks) == len(task_dirs)
    rows_by_task = {task: [] for task in tasks}
    for shard_num, (X, y, w, ids) in enumerate(self.itershards()):
      log("Processing shard %d" % shard_num, self.verbosity)
      basename = "dataset-%d" % shard_num
      for task_num, task in enumerate(tasks):
        log("\tTask %s" % task, self.verbosity)
        w_column = w[:, task_num]
        y_column = y[:, task_num]

        # A nonzero weight marks a datapoint as present for this task.
        present = w_column != 0
        X_kept = X[present]
        column_shape = (X_kept.shape[0], 1)
        y_kept = np.reshape(y_column[present], column_shape)
        w_kept = np.reshape(w_column[present], column_shape)
        ids_kept = ids[present]

        metadata_row = Dataset.write_data_to_disk(
            task_dirs[task_num], basename, [task],
            X_kept, y_kept, w_kept, ids_kept)
        rows_by_task[task].append(metadata_row)

    task_datasets = []
    for task_num, task in enumerate(tasks):
      task_datasets.append(
          Dataset(data_dir=task_dirs[task_num],
                  metadata_rows=rows_by_task[task],
                  verbosity=self.verbosity))
    return task_datasets
    
  def to_numpy(self):
    """
    Transforms internal data into arrays X, y, w
@@ -415,6 +450,26 @@ class Dataset(object):
      total += len(y)
    return total

  def get_shape(self):
    """Finds shape of dataset.

    Walks over every shard and sums the leading (row) dimension of X, y, w
    and ids. The trailing dimensions are taken from the first shard.

    Returns a 4-tuple (X_shape, y_shape, w_shape, ids_shape) in which every
    entry is a tuple of plain Python ints. If there are no shards, all
    dimensions are zero and the rank of X_shape is
    1 + len(self.get_data_shape()).
    """
    # Zero-filled defaults, returned unchanged when no shard is yielded.
    X_shape = [0] + [0] * len(self.get_data_shape())
    y_shape = [0, 0]
    w_shape = [0, 0]
    ids_shape = [0]
    for shard_num, (X, y, w, ids) in enumerate(self.itershards()):
      if shard_num == 0:
        # Adopt the first shard's shapes directly rather than adding them
        # to the zero defaults, so a rank mismatch cannot broadcast silently.
        X_shape = [int(dim) for dim in X.shape]
        y_shape = [int(dim) for dim in y.shape]
        w_shape = [int(dim) for dim in w.shape]
        ids_shape = [int(dim) for dim in ids.shape]
      else:
        # Later shards only contribute additional rows.
        X_shape[0] += int(X.shape[0])
        y_shape[0] += int(y.shape[0])
        w_shape[0] += int(w.shape[0])
        ids_shape[0] += int(ids.shape[0])
    return tuple(X_shape), tuple(y_shape), tuple(w_shape), tuple(ids_shape)

  def get_label_means(self):
    """Return pandas series of label means."""
    return self.metadata_df["y_means"]
+58 −0
Original line number Diff line number Diff line
@@ -69,6 +69,64 @@ class TestBasicDatasetAPI(TestDatasetAPI):
    np.testing.assert_array_equal(ids[indices], ids_sel)
    shutil.rmtree(select_dir)

  def test_get_shape(self):
    """Test that get_shape reports the shapes of the arrays passed in."""
    num_datapoints = 100
    num_features = 10
    num_tasks = 10
    # Generate data
    X = np.random.rand(num_datapoints, num_features)
    y = np.random.randint(2, size=(num_datapoints, num_tasks))
    w = np.random.randint(2, size=(num_datapoints, num_tasks))
    ids = np.array(["id"] * num_datapoints)

    dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids, verbosity="high")

    # get_shape should agree with the arrays the dataset was built from.
    X_shape, y_shape, w_shape, ids_shape = dataset.get_shape()
    assert X_shape == X.shape
    assert y_shape == y.shape
    assert w_shape == w.shape
    assert ids_shape == ids.shape


  def test_to_singletask(self):
    """Test that to_singletask keeps exactly the nonzero-weight rows per task."""
    num_datapoints = 100
    num_features = 10
    num_tasks = 10
    # Random features, binary labels and binary weights for every task.
    X = np.random.rand(num_datapoints, num_features)
    y = np.random.randint(2, size=(num_datapoints, num_tasks))
    w = np.random.randint(2, size=(num_datapoints, num_tasks))
    ids = np.array(["id"] * num_datapoints)

    dataset = Dataset.from_numpy(self.data_dir, X, y, w, ids, verbosity="high")

    task_dirs = []
    try:
      # Directories are created inside the try so that any already made are
      # still removed if a later step fails.
      for _ in range(num_tasks):
        task_dirs.append(tempfile.mkdtemp())
      singletask_datasets = dataset.to_singletask(task_dirs)
      for task in range(num_tasks):
        X_task, y_task, w_task, ids_task = singletask_datasets[task].to_numpy()
        # Rows expected to survive for this task: those with nonzero weight.
        present = w[:, task] != 0
        np.testing.assert_array_equal(X_task, X[present])
        np.testing.assert_array_equal(y_task.flatten(), y[:, task][present])
        np.testing.assert_array_equal(w_task.flatten(), w[:, task][present])
        np.testing.assert_array_equal(ids_task, ids[present])
    finally:
      # Cleanup
      for task_dir in task_dirs:
        shutil.rmtree(task_dir)
  
  def test_iterbatches(self):
    """Test that iterating over batches of data works."""
+21 −7
Original line number Diff line number Diff line
@@ -9,7 +9,6 @@ import os
import numpy as np
from deepchem.utils.save import log
from deepchem.models import Model
# DEBUG
import sklearn

class SingletaskToMultitask(Model):
@@ -42,25 +41,40 @@ class SingletaskToMultitask(Model):
          self.verbosity, "high")
      self.task_model_dirs[task] = task_model_dir

  
  def _create_task_datasets(self, dataset):
    """Splits a multitask dataset into per-task datasets stored on disk.

    For every task in self.tasks a "<task>_data" directory is (re)created
    under self.model_dir; a directory of that name that already exists is
    deleted first. The directories are then handed to dataset.to_singletask
    in the order of self.tasks.

    Returns the value of dataset.to_singletask(...). When self.verbosity is
    not None, the shape of each returned dataset is logged.
    """
    # Imported locally so this method does not depend on a module-level
    # shutil import being present.
    import shutil

    task_data_dirs = []
    for task in self.tasks:
      task_data_dir = os.path.join(self.model_dir, str(task) + "_data")
      # Start from an empty directory so stale shards are never picked up.
      if os.path.exists(task_data_dir):
        shutil.rmtree(task_data_dir)
      os.makedirs(task_data_dir)
      task_data_dirs.append(task_data_dir)
    task_datasets = dataset.to_singletask(task_data_dirs)
    # Computing shapes iterates over the data, so skip it when not logging.
    if self.verbosity is not None:
      for task, task_dataset in zip(self.tasks, task_datasets):
        log("Dataset for task %s has shape %s"
            % (task, str(task_dataset.get_shape())), self.verbosity)
    return task_datasets
   
      
  def fit(self, dataset):
    """
    Updates all singletask models with new information.

    Warning: This current implementation is only functional for sklearn models. 
    """
    X, y, w, _ = dataset.to_numpy()
    task_datasets = self._create_task_datasets(dataset)
    for ind, task in enumerate(self.tasks):
      log("Fitting model for task %s" % task, self.verbosity, "high")
      y_task = y[:, ind]
      w_task = w[:, ind]
      X_task = X[w_task != 0, :]
      y_task = y_task[w_task != 0]
      X_task, y_task, w_task, ids_task = task_datasets[ind].to_numpy()
      task_model = self.model_builder(
          [task], {task: self.task_types[task]}, self.model_params,
          self.task_model_dirs[task],
          verbosity=self.verbosity)
      if y_task.size > 0:
        task_model.raw_model.fit(X_task, y_task)
        task_model.raw_model.fit(X_task, np.ravel(y_task))
      else:
        print("No labels for task %s" % task)
        print("Fitting on dummy dataset.")
+0 −2
Original line number Diff line number Diff line
@@ -11,8 +11,6 @@ from deepchem.datasets import Dataset
from deepchem.transformers import NormalizationTransformer
from deepchem.transformers import ClippingTransformer
from deepchem.hyperparameters import HyperparamOpt
from sklearn.ensemble import RandomForestClassifier
from deepchem.models.sklearn_models import SklearnModel
from bace_features import user_specified_features
from deepchem import metrics
from deepchem.metrics import Metric
+4 −5
Original line number Diff line number Diff line
@@ -47,8 +47,6 @@ num_train = .8 * len(dataset)
X, y, w, ids = dataset.to_numpy()
num_tasks = 17
muv_tasks = muv_tasks[:num_tasks]
print("Using following tasks")
print(muv_tasks)
X_train, X_valid = X[:num_train], X[num_train:]
y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
@@ -57,12 +55,14 @@ ids_train, ids_valid = ids[:num_train], ids[num_train:]
if os.path.exists(train_dir):
  shutil.rmtree(train_dir)
train_dataset = Dataset.from_numpy(train_dir, X_train, y_train,
                                   w_train, ids_train, muv_tasks)
                                   w_train, ids_train, muv_tasks,
                                   verbosity=verbosity)

if os.path.exists(valid_dir):
  shutil.rmtree(valid_dir)
valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid,
                                   w_valid, ids_valid, muv_tasks)
                                   w_valid, ids_valid, muv_tasks,
                                   verbosity=verbosity)

# Fit Logistic Regression models
muv_task_types = {task: "classification" for task in muv_tasks}
@@ -81,7 +81,6 @@ if os.path.exists(model_dir):
os.makedirs(model_dir)
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
  return SklearnModel(tasks, task_types, model_params, model_dir,
                      #model_instance=LogisticRegression(class_weight="balanced"),
                      model_instance=RandomForestClassifier(
                          class_weight="balanced",
                          n_estimators=500),
Loading