Commit 2efccdc6 authored by leswing
Browse files

yapf

parent c17ba162
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -18,7 +18,7 @@ import shutil

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"
__license__ = "MIT"


def sparsify_features(X):
+35 −23
Original line number Diff line number Diff line
@@ -7,7 +7,7 @@ from __future__ import unicode_literals

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"
__license__ = "MIT"

import unittest
import tempfile
@@ -16,6 +16,7 @@ import shutil
import numpy as np
import deepchem as dc


def load_solubility_data():
  """Loads solubility dataset"""
  current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -28,39 +29,45 @@ def load_solubility_data():

  return loader.featurize(input_file)


def load_butina_data():
  """Loads the Butina example dataset.

  Reads ``butina_example.csv`` from ``../../models/tests`` (resolved
  relative to the directory of this file) and featurizes its ``smiles``
  column with size-1024 circular fingerprints. The only task column is
  ``"task"``.

  Returns
  -------
  The dataset object returned by ``dc.data.CSVLoader.featurize``.
  """
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["task"]
  # Single assignment: the path was previously computed twice with the
  # same value.
  input_file = os.path.join(current_dir,
                            "../../models/tests/butina_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)

  return loader.featurize(input_file)


def load_multitask_data():
  """Load example multitask data.

  Reads ``multitask_example.csv`` from ``../../models/tests`` (resolved
  relative to the directory of this file) and featurizes its ``smiles``
  column with size-1024 circular fingerprints. The task columns are
  ``task0`` through ``task16``.

  Returns
  -------
  The dataset object returned by ``dc.data.CSVLoader.featurize``.
  """
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  # Seventeen task columns, "task0" ... "task16", in numeric order.
  tasks = ["task%d" % i for i in range(17)]
  input_file = os.path.join(current_dir,
                            "../../models/tests/multitask_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)


def load_classification_data():
  """Loads classification data from example_classification.csv.

  The CSV is looked up under ``../../models/tests`` relative to the
  directory of this file. Its ``smiles`` column is featurized with
  size-1024 circular fingerprints and the single task column is
  ``"outcome"``.

  Returns
  -------
  The dataset object returned by ``dc.data.CSVLoader.featurize``.
  """
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["outcome"]
  input_file = os.path.join(current_dir,
                            "../../models/tests/example_classification.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)
@@ -70,26 +77,30 @@ def load_sparse_multitask_dataset():
  """Load sparse tox multitask data, sample dataset."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9"]
  input_file = os.path.join(
      current_dir, "../../models/tests/sparse_multitask_example.csv")
  tasks = [
      "task1", "task2", "task3", "task4", "task5", "task6", "task7", "task8",
      "task9"
  ]
  input_file = os.path.join(current_dir,
                            "../../models/tests/sparse_multitask_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)


def load_feat_multitask_data():
  """Load example with numerical features, tasks.

  Reads ``feat_multitask_example.csv`` from ``../../models/tests``
  (resolved relative to the directory of this file). Columns ``feat0``
  through ``feat5`` are passed to ``UserDefinedFeaturizer`` as the feature
  columns, ``task0`` through ``task5`` are the task columns, and ``id`` is
  passed as the loader's ``id_field``.

  Returns
  -------
  The dataset object returned by ``dc.data.UserCSVLoader.featurize``.
  """
  current_dir = os.path.dirname(os.path.abspath(__file__))
  features = ["feat0", "feat1", "feat2", "feat3", "feat4", "feat5"]
  featurizer = dc.feat.UserDefinedFeaturizer(features)
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5"]
  input_file = os.path.join(current_dir,
                            "../../models/tests/feat_multitask_example.csv")
  loader = dc.data.UserCSVLoader(
      tasks=tasks, featurizer=featurizer, id_field="id")
  return loader.featurize(input_file)


def load_gaussian_cdf_data():
  """Load example with numbers sampled from Gaussian normal distribution.
     Each feature and task is a column of values that is sampled
@@ -98,12 +109,13 @@ def load_gaussian_cdf_data():
  features = ["feat0", "feat1"]
  featurizer = dc.feat.UserDefinedFeaturizer(features)
  tasks = ["task0", "task1"]
  input_file = os.path.join(
      current_dir, "../../models/tests/gaussian_cdf_example.csv")
  input_file = os.path.join(current_dir,
                            "../../models/tests/gaussian_cdf_example.csv")
  loader = dc.data.UserCSVLoader(
      tasks=tasks, featurizer=featurizer, id_field="id")
  return loader.featurize(input_file)


def load_unlabelled_data():
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
+26 −30
Original line number Diff line number Diff line
@@ -7,7 +7,7 @@ from __future__ import unicode_literals

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"
__license__ = "MIT"

import os
import unittest
@@ -15,21 +15,22 @@ import tempfile
import shutil
import deepchem as dc


class TestDataLoader(unittest.TestCase):
  """
  Test DataLoader 
  """

  def setUp(self):
    super(TestDataLoader, self).setUp()
    self.current_dir = os.path.dirname(os.path.abspath(__file__))

  def unlabelled_test(self):
    """Featurize ``no_labels.csv`` with an empty task list.

    The method makes no assertions; it only exercises
    ``CSVLoader.featurize`` with ``tasks=[]`` and relies on an exception
    to signal failure.

    NOTE(review): the name does not start with ``test``, so default
    unittest discovery will not collect it -- confirm this is intentional.
    """
    input_file = os.path.join(self.current_dir,
                              "../../data/tests/no_labels.csv")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    loader = dc.data.CSVLoader(
        tasks=[], smiles_field="smiles", featurizer=featurizer)
    loader.featurize(input_file)

  def scaffold_test_train_valid_test_split(self):
@@ -41,14 +42,13 @@ class TestDataLoader(unittest.TestCase):
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(
        self.current_dir, "../../models/tests/example.csv")
    input_file = os.path.join(self.current_dir,
                              "../../models/tests/example.csv")
    featurizer = dc.feat.CircularFingerprint(size=1024)

    input_file = os.path.join(self.current_dir, input_file)
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles",
        featurizer=featurizer)
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)

    dataset = loader.featurize(input_file)

@@ -69,14 +69,13 @@ class TestDataLoader(unittest.TestCase):
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(
        self.current_dir, "../../models/tests/example.csv")
    input_file = os.path.join(self.current_dir,
                              "../../models/tests/example.csv")
    featurizer = dc.feat.CircularFingerprint(size=1024)

    input_file = os.path.join(self.current_dir, input_file)
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles",
        featurizer=featurizer)
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)

    dataset = loader.featurize(input_file)

@@ -94,14 +93,13 @@ class TestDataLoader(unittest.TestCase):
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(
        self.current_dir, "../../models/tests/example.csv")
    input_file = os.path.join(self.current_dir,
                              "../../models/tests/example.csv")
    featurizer = dc.feat.CircularFingerprint(size=1024)

    input_file = os.path.join(self.current_dir, input_file)
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles",
        featurizer=featurizer)
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)

    dataset = loader.featurize(input_file)

@@ -120,12 +118,11 @@ class TestDataLoader(unittest.TestCase):
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(
        self.current_dir, "../../models/tests/example.csv")
    input_file = os.path.join(self.current_dir,
                              "../../models/tests/example.csv")
    featurizer = dc.feat.CircularFingerprint(size=1024)
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles",
        featurizer=featurizer)
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)

    dataset = loader.featurize(input_file)

@@ -144,7 +141,8 @@ class TestDataLoader(unittest.TestCase):
    tasks = ["log-solubility"]
    smiles_field = "smiles"
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles",
        tasks=tasks,
        smiles_field="smiles",
        featurizer=dc.feat.CircularFingerprint(size=1024))
    dataset = loader.featurize(input_file)

@@ -155,16 +153,14 @@ class TestDataLoader(unittest.TestCase):
    base_dir = tempfile.mkdtemp()
    data_dir = os.path.join(base_dir, "data")
    moved_data_dir = os.path.join(base_dir, "moved_data")
    dataset_file = os.path.join(
        self.current_dir, "../../models/tests/example.csv")
    dataset_file = os.path.join(self.current_dir,
                                "../../models/tests/example.csv")

    featurizer = dc.feat.CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
    loader = dc.data.CSVLoader(
        tasks=tasks, smiles_field="smiles",
        featurizer=featurizer)
    featurized_dataset = loader.featurize(
        dataset_file, data_dir)
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    featurized_dataset = loader.featurize(dataset_file, data_dir)
    n_dataset = len(featurized_dataset)

    # Now perform move
+51 −48
Original line number Diff line number Diff line
@@ -7,7 +7,7 @@ from __future__ import unicode_literals

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"
__license__ = "MIT"

import unittest
import tempfile
@@ -16,6 +16,7 @@ import shutil
import numpy as np
import deepchem as dc


class TestDatasets(unittest.TestCase):
  """
  Test basic top-level API for dataset objects.
@@ -90,7 +91,6 @@ class TestDatasets(unittest.TestCase):
    X_out = dc.data.pad_features(batch_size, X_b)
    assert len(X_out) == batch_size


  def test_pad_batches(self):
    """Test that pad_batch pads batches correctly."""
    batch_size = 100
@@ -104,8 +104,8 @@ class TestDatasets(unittest.TestCase):
    w_b = np.zeros((n_samples, num_tasks))
    ids_b = np.zeros((n_samples,))

    X_out, y_out, w_out, ids_out = dc.data.pad_batch(
        batch_size, X_b, y_b, w_b, ids_b)
    X_out, y_out, w_out, ids_out = dc.data.pad_batch(batch_size, X_b, y_b, w_b,
                                                     ids_b)
    assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == batch_size

    # Test cases where n_samples < batch_size
@@ -115,8 +115,8 @@ class TestDatasets(unittest.TestCase):
    w_b = np.zeros((n_samples, num_tasks))
    ids_b = np.zeros((n_samples,))

    X_out, y_out, w_out, ids_out = dc.data.pad_batch(
        batch_size, X_b, y_b, w_b, ids_b)
    X_out, y_out, w_out, ids_out = dc.data.pad_batch(batch_size, X_b, y_b, w_b,
                                                     ids_b)
    assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == batch_size

    # Test case where n_samples == batch_size
@@ -126,8 +126,8 @@ class TestDatasets(unittest.TestCase):
    w_b = np.zeros((n_samples, num_tasks))
    ids_b = np.zeros((n_samples,))

    X_out, y_out, w_out, ids_out = dc.data.pad_batch(
        batch_size, X_b, y_b, w_b, ids_b)
    X_out, y_out, w_out, ids_out = dc.data.pad_batch(batch_size, X_b, y_b, w_b,
                                                     ids_b)
    assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == batch_size

    # Test case for object featurization.
@@ -136,8 +136,8 @@ class TestDatasets(unittest.TestCase):
    y_b = np.zeros((n_samples, num_tasks))
    w_b = np.zeros((n_samples, num_tasks))
    ids_b = np.zeros((n_samples,))
    X_out, y_out, w_out, ids_out = dc.data.pad_batch(
        batch_size, X_b, y_b, w_b, ids_b)
    X_out, y_out, w_out, ids_out = dc.data.pad_batch(batch_size, X_b, y_b, w_b,
                                                     ids_b)
    assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == batch_size

    # Test case for more complicated object featurization
@@ -146,8 +146,8 @@ class TestDatasets(unittest.TestCase):
    y_b = np.zeros((n_samples, num_tasks))
    w_b = np.zeros((n_samples, num_tasks))
    ids_b = np.zeros((n_samples,))
    X_out, y_out, w_out, ids_out = dc.data.pad_batch(
        batch_size, X_b, y_b, w_b, ids_b)
    X_out, y_out, w_out, ids_out = dc.data.pad_batch(batch_size, X_b, y_b, w_b,
                                                     ids_b)
    assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == batch_size

    # Test case with multidimensional data
@@ -159,8 +159,8 @@ class TestDatasets(unittest.TestCase):
    w_b = np.zeros((n_samples, num_tasks))
    ids_b = np.zeros((n_samples,))

    X_out, y_out, w_out, ids_out = dc.data.pad_batch(
        batch_size, X_b, y_b, w_b, ids_b)
    X_out, y_out, w_out, ids_out = dc.data.pad_batch(batch_size, X_b, y_b, w_b,
                                                     ids_b)
    assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == batch_size

  def test_get_task_names(self):
@@ -169,10 +169,11 @@ class TestDatasets(unittest.TestCase):
    assert solubility_dataset.get_task_names() == ["log-solubility"]

    multitask_dataset = dc.data.tests.load_multitask_data()
    assert sorted(multitask_dataset.get_task_names()) == sorted(["task0",
        "task1", "task2", "task3", "task4", "task5", "task6", "task7", "task8",
        "task9", "task10", "task11", "task12", "task13", "task14", "task15",
        "task16"])
    assert sorted(multitask_dataset.get_task_names()) == sorted([
        "task0", "task1", "task2", "task3", "task4", "task5", "task6", "task7",
        "task8", "task9", "task10", "task11", "task12", "task13", "task14",
        "task15", "task16"
    ])

  def test_get_data_shape(self):
    """Test that get_data_shape returns currect data shape"""
@@ -313,6 +314,7 @@ class TestDatasets(unittest.TestCase):

    def fn(x, y, w):
      """Scale features by 2 and labels by 1.5; pass weights through."""
      scaled_x = 2 * x
      scaled_y = 1.5 * y
      return scaled_x, scaled_y, w

    transformed = dataset.transform(fn)
    np.testing.assert_array_equal(X, dataset.X)
    np.testing.assert_array_equal(y, dataset.y)
@@ -334,6 +336,7 @@ class TestDatasets(unittest.TestCase):
    # Transform it
    def fn(x, y, w):
      """Return (2 * x, 1.5 * y, w): doubled features, scaled labels."""
      doubled = 2 * x
      stretched = 1.5 * y
      return doubled, stretched, w

    transformed = dataset.transform(fn)
    np.testing.assert_array_equal(X, dataset.X)
    np.testing.assert_array_equal(y, dataset.y)
+12 −13
Original line number Diff line number Diff line
@@ -7,7 +7,7 @@ from __future__ import unicode_literals

__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"
__license__ = "MIT"

import os
import shutil
@@ -16,6 +16,7 @@ import tempfile
import deepchem as dc
import numpy as np


class TestLoad(unittest.TestCase):
  """
  Test singletask/multitask data loading.
@@ -27,8 +28,7 @@ class TestLoad(unittest.TestCase):
    base_dir = tempfile.mkdtemp()
    data_dir = os.path.join(base_dir, "data")
    moved_data_dir = os.path.join(base_dir, "moved_data")
    dataset_file = os.path.join(
        current_dir, "../../models/tests/example.csv")
    dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

    featurizer = dc.feat.CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
@@ -61,8 +61,8 @@ class TestLoad(unittest.TestCase):

    # Load dataset
    print("About to load dataset.")
    dataset_file = os.path.join(
        current_dir, "../../models/tests/multitask_example.csv")
    dataset_file = os.path.join(current_dir,
                                "../../models/tests/multitask_example.csv")

    # Featurize tox21 dataset
    print("About to featurize dataset.")
@@ -78,7 +78,6 @@ class TestLoad(unittest.TestCase):
    X_multi, y_multi, w_multi, ids_multi = (dataset.X, dataset.y, dataset.w,
                                            dataset.ids)


    ####### Do singletask load
    y_tasks, w_tasks, = [], []
    dataset = dc.data.DiskDataset(data_dir)
@@ -115,8 +114,8 @@ class TestLoad(unittest.TestCase):

    # Load dataset
    print("About to load dataset.")
    dataset_file = os.path.join(
        current_dir, "../../models/tests/multitask_example.csv")
    dataset_file = os.path.join(current_dir,
                                "../../models/tests/multitask_example.csv")

    # Featurize tox21 dataset
    print("About to featurize dataset.")
Loading