Commit 0af7a051 authored by Ubuntu
Browse files

Reformatted many source files with yapf

parent c94913e0
Loading
Loading
Loading
Loading
+40 −32
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@ import time
import numpy as np
from deepchem.data import NumpyDataset


def remove_dead_examples(dataset):
  """Removes compounds with no weight.

@@ -28,6 +29,7 @@ def remove_dead_examples(dataset):

  return NumpyDataset(X, y, w, ids)


def dataset_difference(dataset, remove):
  """Removes the compounds in remove from dataset.

@@ -39,8 +41,9 @@ def dataset_difference(dataset, remove):
    Dataset whose overlap will be removed.
  """
  remove_ids = set(remove.ids)
  keep_inds = [ind for ind in range(len(dataset))
               if dataset.ids[ind] not in remove_ids]
  keep_inds = [
      ind for ind in range(len(dataset)) if dataset.ids[ind] not in remove_ids
  ]

  # Remove support indices
  X = dataset.X[keep_inds]
@@ -50,6 +53,7 @@ def dataset_difference(dataset, remove):

  return NumpyDataset(X, y, w, ids)


def get_task_dataset_minus_support(dataset, support, task):
  """Gets data for specified task, minus support points.

@@ -66,8 +70,9 @@ def get_task_dataset_minus_support(dataset, support, task):
    Task number of task to select.
  """
  support_ids = set(support.ids)
  non_support_inds = [ind for ind in range(len(dataset))
                      if dataset.ids[ind] not in support_ids]
  non_support_inds = [
      ind for ind in range(len(dataset)) if dataset.ids[ind] not in support_ids
  ]

  # Remove support indices
  X = dataset.X[non_support_inds]
@@ -85,6 +90,7 @@ def get_task_dataset_minus_support(dataset, support, task):

  return NumpyDataset(X_task, y_task, w_task, ids_task)


def get_task_dataset(dataset, task):
  """Selects out entries for a particular task."""
  X, y, w, ids = dataset.X, dataset.y, dataset.w, dataset.ids
@@ -98,6 +104,7 @@ def get_task_dataset(dataset, task):

  return NumpyDataset(X_task, y_task, w_task, ids_task)


def get_task_test(dataset, n_episodes, n_test, task, log_every_n=50):
  """Gets test set from specified task.

@@ -135,6 +142,7 @@ def get_task_test(dataset, n_episodes, n_test, task, log_every_n=50):
    tests.append(NumpyDataset(X_batch, y_batch, w_batch, ids_batch))
  return tests


def get_single_task_test(dataset, batch_size, task, replace=True):
  """Gets test set from specified task.

@@ -156,7 +164,6 @@ def get_single_task_test(dataset, batch_size, task, replace=True):
  return NumpyDataset(X_batch, y_batch, w_batch, ids_batch)



def get_single_task_support(dataset, n_pos, n_neg, task, replace=True):
  """Generates one support set purely for specified task.
  
@@ -180,6 +187,7 @@ def get_single_task_support(dataset, n_pos, n_neg, task, replace=True):
  """
  return get_task_support(dataset, 1, n_pos, n_neg, task)[0]


def get_task_support(dataset, n_episodes, n_pos, n_neg, task, log_every_n=50):
  """Generates one support set purely for specified task.
  
@@ -221,26 +229,23 @@ def get_task_support(dataset, n_episodes, n_pos, n_neg, task, log_every_n=50):
    # Handle one-d vs. non one-d feature matrices
    one_dimensional_features = (len(dataset.X.shape) == 1)
    if not one_dimensional_features:
      X = np.vstack(
          [dataset.X[pos_inds], dataset.X[neg_inds]])
      X = np.vstack([dataset.X[pos_inds], dataset.X[neg_inds]])
    else:
      X = np.concatenate(
          [dataset.X[pos_inds], dataset.X[neg_inds]])
    y = np.concatenate(
        [dataset.y[pos_inds, task], dataset.y[neg_inds, task]])
    w = np.concatenate(
        [dataset.w[pos_inds, task], dataset.w[neg_inds, task]])
    ids = np.concatenate(
        [dataset.ids[pos_inds], dataset.ids[neg_inds]])
      X = np.concatenate([dataset.X[pos_inds], dataset.X[neg_inds]])
    y = np.concatenate([dataset.y[pos_inds, task], dataset.y[neg_inds, task]])
    w = np.concatenate([dataset.w[pos_inds, task], dataset.w[neg_inds, task]])
    ids = np.concatenate([dataset.ids[pos_inds], dataset.ids[neg_inds]])
    supports.append(NumpyDataset(X, y, w, ids))
  return supports


class EpisodeGenerator(object):
  """Generates (support, test) pairs for episodic training.

  Precomputes all (support, test) pairs at construction. Allows to reduce
  overhead from computation.
  """

  def __init__(self, dataset, n_pos, n_neg, n_test, n_episodes_per_task):
    """
    Parameters
@@ -268,10 +273,10 @@ class EpisodeGenerator(object):
    self.task_episodes = {}

    for task in range(self.n_tasks):
      task_supports = get_task_support(
          self.dataset, n_episodes_per_task, n_pos, n_neg, task)
      task_tests = get_task_test(
          self.dataset, n_episodes_per_task, n_test, task)
      task_supports = get_task_support(self.dataset, n_episodes_per_task, n_pos,
                                       n_neg, task)
      task_tests = get_task_test(self.dataset, n_episodes_per_task, n_test,
                                 task)
      self.task_episodes[task] = (task_supports, task_tests)

    # Init the iterator
@@ -280,8 +285,8 @@ class EpisodeGenerator(object):
    self.task_num = 0
    self.trial_num = 0
    time_end = time.time()
    print("Constructing EpisodeGenerator took %s seconds"
          % str(time_end-time_start))
    print("Constructing EpisodeGenerator took %s seconds" %
          str(time_end - time_start))

  def __iter__(self):
    return self
@@ -317,6 +322,7 @@ class SupportGenerator(object):
  Iterates over tasks and trials. For each trial, picks one support from
  each task, and returns in a randomized order
  """

  def __init__(self, dataset, n_pos, n_neg, n_trials):
    """
    Parameters
@@ -360,7 +366,10 @@ class SupportGenerator(object):
      task = self.perm_tasks[self.task_num]  # Get id from permutation
      #support = self.supports[task][self.trial_num]
      support = get_single_task_support(
          self.dataset, n_pos=self.n_pos, n_neg=self.n_neg, task=task,
          self.dataset,
          n_pos=self.n_pos,
          n_neg=self.n_neg,
          task=task,
          replace=False)
      # Increment and update logic
      self.task_num += 1
@@ -372,4 +381,3 @@ class SupportGenerator(object):
      return (task, support)

  __next__ = next  # Python 3.X compatibility
+2 −2
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@ import deepchem as dc
import numpy as np
from sklearn.ensemble import RandomForestClassifier


class TestDrop(unittest.TestCase):
  """
  Test how loading of malformed compounds is handled.
@@ -21,8 +22,7 @@ class TestDrop(unittest.TestCase):

    current_dir = os.path.dirname(os.path.realpath(__file__))
    print("About to load emols dataset.")
    dataset_file = os.path.join(
        current_dir, "mini_emols.csv")
    dataset_file = os.path.join(current_dir, "mini_emols.csv")

    # Featurize emols dataset
    print("About to featurize datasets.")
+8 −8
Original line number Diff line number Diff line
@@ -74,13 +74,13 @@ class TestReload(unittest.TestCase):
    dataset_file = os.path.join(current_dir,
                                "../../../datasets/mini_muv.csv.gz")
    print("Running experiment for first time without reload.")
    (len_train, len_valid, len_test) = self._run_muv_experiment(dataset_file,
                                                                reload)
    (len_train, len_valid, len_test) = self._run_muv_experiment(
        dataset_file, reload)

    print("Running experiment for second time with reload.")
    reload = True
    (len_reload_train, len_reload_valid, len_reload_test) = (
        self._run_muv_experiment(dataset_file, reload))
    (len_reload_train, len_reload_valid,
     len_reload_test) = (self._run_muv_experiment(dataset_file, reload))
    assert len_train == len_reload_train
    assert len_valid == len_reload_valid
    assert len_test == len_reload_valid
@@ -92,12 +92,12 @@ class TestReload(unittest.TestCase):
    dataset_file = os.path.join(current_dir,
                                "../../../datasets/mini_muv.csv.gz")
    print("Running experiment for first time with reload.")
    (len_train, len_valid, len_test) = self._run_muv_experiment(dataset_file,
                                                                reload)
    (len_train, len_valid, len_test) = self._run_muv_experiment(
        dataset_file, reload)

    print("Running experiment for second time with reload.")
    (len_reload_train, len_reload_valid, len_reload_test) = (
        self._run_muv_experiment(dataset_file, reload))
    (len_reload_train, len_reload_valid,
     len_reload_test) = (self._run_muv_experiment(dataset_file, reload))
    assert len_train == len_reload_train
    assert len_valid == len_reload_valid
    assert len_test == len_reload_valid
+2 −2
Original line number Diff line number Diff line
@@ -372,8 +372,8 @@ class TestSupports(unittest.TestCase):
                                                 n_trials)

    for ind, (task, support) in enumerate(support_generator):
      task_dataset = dc.data.get_task_dataset_minus_support(dataset, support,
                                                            task)
      task_dataset = dc.data.get_task_dataset_minus_support(
          dataset, support, task)

      task_y = dataset.y[:, task]
      task_w = dataset.w[:, task]
+2 −2
Original line number Diff line number Diff line
@@ -43,8 +43,8 @@ class MetricsTest(googletest.TestCase):
  def test_one_hot(self):
    y = np.array([0, 0, 1, 0, 1, 1, 0])
    y_hot = metrics.to_one_hot(y)
    expected = np.array(
        [[1, 0], [1, 0], [0, 1], [1, 0], [0, 1], [0, 1], [1, 0]])
    expected = np.array([[1, 0], [1, 0], [0, 1], [1, 0], [0, 1], [0, 1], [1,
                                                                          0]])
    yp = metrics.from_one_hot(y_hot)
    assert np.array_equal(expected, y_hot)
    assert np.array_equal(y, yp)
Loading