Commit 96c21502 authored by Bharath Ramsundar
Browse files

next round of edits

parent e7db067b
Loading
Loading
Loading
Loading
+15 −0
Original line number Diff line number Diff line
@@ -241,6 +241,21 @@ class Dataset(object):
    raw_data = (ids, X, y, w)
    return Dataset(data_dir=data_dir, tasks=tasks, raw_data=raw_data)

  @staticmethod
  def merge(merge_dir, datasets):
    """Merges provided datasets into a merged dataset.

    Every dataset is converted with `to_numpy()`. The X, y and w arrays are
    stacked row-wise with `np.vstack`, the ids are joined with
    `np.concatenate`, and the combined arrays are handed to
    `Dataset.from_numpy`.

    Args:
      merge_dir: Passed through to `Dataset.from_numpy` as the location of
        the merged dataset.
      datasets: Non-empty iterable of datasets. Each must provide
        `to_numpy()` returning `(X, y, w, ids)` and `get_task_names()`.
        NOTE(review): the task names are taken from the last dataset only;
        presumably all datasets are expected to share the same tasks --
        confirm with callers.

    Returns:
      The result of `Dataset.from_numpy` on the combined arrays.

    Raises:
      ValueError: If `datasets` yields no datasets.
    """
    Xs, ys, ws, all_ids = [], [], [], []
    for dataset in datasets:
      X, y, w, ids = dataset.to_numpy()
      Xs.append(X)
      ys.append(y)
      ws.append(w)
      all_ids.append(ids)
    # Without this guard an empty input would fail below with an
    # UnboundLocalError on `dataset`, hiding the real problem from the caller.
    if not Xs:
      raise ValueError("Dataset.merge requires at least one dataset to merge.")
    # `dataset` is the last dataset seen by the loop above.
    tasks = dataset.get_task_names()
    X, y, w, ids = (
        np.vstack(Xs), np.vstack(ys), np.vstack(ws), np.concatenate(all_ids))
    return Dataset.from_numpy(merge_dir, X, y, w, ids, tasks)

  def select(self, select_dir, indices):
    """Creates a new dataset from a selection of indices from self."""
    indices = np.array(indices).astype(int)
+51 −0
Original line number Diff line number Diff line
"""
Testing singletask/multitask dataset merging
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"

import os
import shutil
import tempfile
import numpy as np
from deepchem.models.tests import TestAPI
from deepchem.utils.save import load_from_disk
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.datasets import Dataset

class TestMerge(TestAPI):
  """
  Checks merging of featurized datasets via Dataset.merge.
  """
  def test_move_load(self):
    """Test that a merged dataset is as long as its two inputs combined."""
    here = os.path.dirname(os.path.realpath(__file__))
    csv_path = os.path.join(here, "../../models/tests/example.csv")

    # One directory per dataset, all under self.base_dir (not set in this
    # class; presumably provided by TestAPI).
    first_dir, second_dir, merged_dir = [
        os.path.join(self.base_dir, name)
        for name in ("first_dataset", "second_dataset", "merged_data")]

    loader = DataFeaturizer(
        tasks=["log-solubility"],
        smiles_field="smiles",
        featurizers=[CircularFingerprint(size=1024)],
        verbosity="high")

    # The same input file is featurized twice, into separate directories,
    # to obtain two datasets to merge.
    first = loader.featurize(csv_path, first_dir)
    second = loader.featurize(csv_path, second_dir)

    merged = Dataset.merge(merged_dir, [first, second])

    assert len(merged) == len(first) + len(second)
+22 −7
Original line number Diff line number Diff line
@@ -55,10 +55,24 @@ class SingletaskToMultitask(Model):
      w_task = w[:, ind]
      X_task = X[w_task != 0, :]
      y_task = y_task[w_task != 0]
      task_model = self.model_builder([task], {task: self.task_types[task]}, self.model_params,
      task_model = self.model_builder(
          [task], {task: self.task_types[task]}, self.model_params,
          self.task_model_dirs[task],
          verbosity=self.verbosity)
      #################################### DEBUG
      if y_task.size > 0:
      #################################### DEBUG
        task_model.raw_model.fit(X_task, y_task)
      #################################### DEBUG
      else:
        print("No labels for task %s" % task)
        print("Fitting on dummy dataset.")
        X_task_fake = np.zeros_like(X)
        y_task_fake = np.zeros_like(w_task)
        print("X.shape, y.shape, w.shape, y_task.shape, w_task.shape, y_task_fake.shape")
        print(X.shape, y.shape, w.shape, y_task.shape, w_task.shape, y_task_fake.shape)
        task_model.raw_model.fit(X_task_fake, y_task_fake)
      #################################### DEBUG
      task_model.save()

  def predict_on_batch(self, X):
@@ -70,7 +84,8 @@ class SingletaskToMultitask(Model):
    y_pred = np.zeros((n_samples, n_tasks))
    for ind, task in enumerate(self.tasks):
      task_type = self.task_types[task]
      task_model = self.model_builder([task], {task: self.task_types[task]}, self.model_params,
      task_model = self.model_builder(
          [task], {task: self.task_types[task]}, self.model_params,
          self.task_model_dirs[task],
          verbosity=self.verbosity)
      task_model.reload()
+0 −5
Original line number Diff line number Diff line
@@ -42,11 +42,6 @@ class SklearnModel(Model):
    Fits SKLearn model to data.
    """
    X, y, w, _ = dataset.to_numpy()
    ########################################### DEBUG
    print("SklearnModel.fit()")
    print("X.shape, y.shape, w.shape")
    print(X.shape, y.shape, w.shape)
    ########################################### DEBUG
    y, w = np.squeeze(y), np.squeeze(w)
    # Logistic regression doesn't support weights
    if not isinstance(self.raw_model, LogisticRegression):