Commit af2cfe67 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #653 from patrickhop/master

A variety of optimizations and additions.
parents b148e8ad 0af7a051
Loading
Loading
Loading
Loading
+25 −0
Original line number Diff line number Diff line
@@ -80,6 +80,31 @@ def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
  return np.squeeze(np.array(features)), valid_inds


def featurize_smiles_np(arr, featurizer, log_every_N=1000, verbose=True):
  """Featurize individual compounds in a numpy array.

  Applies `featurizer` to every SMILES string in `arr`, first renumbering
  the atoms of each parseable molecule into canonical rank order so the
  featurization is independent of the input atom ordering. Entries whose
  featurization comes back empty are discarded, and the surviving features
  are returned squeezed and flattened into a 1-D numpy array.
  """
  raw_features = []
  for idx, smiles in enumerate(arr.tolist()):
    if idx % log_every_N == 0:
      log("Featurizing sample %d" % idx, verbose)
    mol = Chem.MolFromSmiles(smiles)
    if mol:
      # Canonicalize atom numbering; unparseable SMILES leave mol as None
      # and are handed to the featurizer as-is (filtered out below).
      mol = rdmolops.RenumberAtoms(mol, rdmolfiles.CanonicalRankAtoms(mol))
    raw_features.append(featurizer.featurize([mol]))

  # Keep only samples that actually produced features.
  kept = [feat for feat in raw_features if feat.size > 0]
  return np.squeeze(np.array(kept)).reshape(-1,)


def get_user_specified_features(df, featurizer, verbose=True):
  """Extract and merge user specified features. 

+6 −7
Original line number Diff line number Diff line
@@ -246,7 +246,7 @@ class Dataset(object):
class NumpyDataset(Dataset):
  """A Dataset defined by in-memory numpy arrays."""

  def __init__(self, X, y=None, w=None, ids=None):
  def __init__(self, X, y=None, w=None, ids=None, n_tasks=1):
    n_samples = len(X)
    # The -1 indicates that y will be reshaped to have length -1
    if n_samples > 0:
@@ -256,9 +256,8 @@ class NumpyDataset(Dataset):
          w = np.reshape(w, (n_samples, -1))
      else:
        # Set labels to be zero, with zero weights
        y = np.zeros((n_samples, 1))
        y = np.zeros((n_samples, n_tasks))
        w = np.zeros_like(y)
    n_tasks = y.shape[1]
    if ids is None:
      ids = np.arange(n_samples)
    if w is None:
@@ -932,8 +931,8 @@ class DiskDataset(Dataset):
          if indices_count + num_shard_elts >= len(indices):
            break
        # Need to offset indices to fit within shard_size
        shard_inds = indices[indices_count:indices_count +
                             num_shard_elts] - count
        shard_inds = indices[indices_count:
                             indices_count + num_shard_elts] - count
        X_sel = X[shard_inds]
        # Handle the case of datasets with y/w missing
        if y is not None:
+40 −32
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@ import time
import numpy as np
from deepchem.data import NumpyDataset


def remove_dead_examples(dataset):
  """Removes compounds with no weight.

@@ -28,6 +29,7 @@ def remove_dead_examples(dataset):

  return NumpyDataset(X, y, w, ids)


def dataset_difference(dataset, remove):
  """Removes the compounds in remove from dataset.

@@ -39,8 +41,9 @@ def dataset_difference(dataset, remove):
    Dataset whose overlap will be removed.
  """
  remove_ids = set(remove.ids)
  keep_inds = [ind for ind in range(len(dataset))
               if dataset.ids[ind] not in remove_ids]
  keep_inds = [
      ind for ind in range(len(dataset)) if dataset.ids[ind] not in remove_ids
  ]

  # Remove support indices
  X = dataset.X[keep_inds]
@@ -50,6 +53,7 @@ def dataset_difference(dataset, remove):

  return NumpyDataset(X, y, w, ids)


def get_task_dataset_minus_support(dataset, support, task):
  """Gets data for specified task, minus support points.

@@ -66,8 +70,9 @@ def get_task_dataset_minus_support(dataset, support, task):
    Task number of task to select.
  """
  support_ids = set(support.ids)
  non_support_inds = [ind for ind in range(len(dataset))
                      if dataset.ids[ind] not in support_ids]
  non_support_inds = [
      ind for ind in range(len(dataset)) if dataset.ids[ind] not in support_ids
  ]

  # Remove support indices
  X = dataset.X[non_support_inds]
@@ -85,6 +90,7 @@ def get_task_dataset_minus_support(dataset, support, task):

  return NumpyDataset(X_task, y_task, w_task, ids_task)


def get_task_dataset(dataset, task):
  """Selects out entries for a particular task."""
  X, y, w, ids = dataset.X, dataset.y, dataset.w, dataset.ids
@@ -98,6 +104,7 @@ def get_task_dataset(dataset, task):

  return NumpyDataset(X_task, y_task, w_task, ids_task)


def get_task_test(dataset, n_episodes, n_test, task, log_every_n=50):
  """Gets test set from specified task.

@@ -135,6 +142,7 @@ def get_task_test(dataset, n_episodes, n_test, task, log_every_n=50):
    tests.append(NumpyDataset(X_batch, y_batch, w_batch, ids_batch))
  return tests


def get_single_task_test(dataset, batch_size, task, replace=True):
  """Gets test set from specified task.

@@ -156,7 +164,6 @@ def get_single_task_test(dataset, batch_size, task, replace=True):
  return NumpyDataset(X_batch, y_batch, w_batch, ids_batch)



def get_single_task_support(dataset, n_pos, n_neg, task, replace=True):
  """Generates one support set purely for specified task.
  
@@ -180,6 +187,7 @@ def get_single_task_support(dataset, n_pos, n_neg, task, replace=True):
  """
  return get_task_support(dataset, 1, n_pos, n_neg, task)[0]


def get_task_support(dataset, n_episodes, n_pos, n_neg, task, log_every_n=50):
  """Generates one support set purely for specified task.
  
@@ -221,26 +229,23 @@ def get_task_support(dataset, n_episodes, n_pos, n_neg, task, log_every_n=50):
    # Handle one-d vs. non one-d feature matrices
    one_dimensional_features = (len(dataset.X.shape) == 1)
    if not one_dimensional_features:
      X = np.vstack(
          [dataset.X[pos_inds], dataset.X[neg_inds]])
      X = np.vstack([dataset.X[pos_inds], dataset.X[neg_inds]])
    else:
      X = np.concatenate(
          [dataset.X[pos_inds], dataset.X[neg_inds]])
    y = np.concatenate(
        [dataset.y[pos_inds, task], dataset.y[neg_inds, task]])
    w = np.concatenate(
        [dataset.w[pos_inds, task], dataset.w[neg_inds, task]])
    ids = np.concatenate(
        [dataset.ids[pos_inds], dataset.ids[neg_inds]])
      X = np.concatenate([dataset.X[pos_inds], dataset.X[neg_inds]])
    y = np.concatenate([dataset.y[pos_inds, task], dataset.y[neg_inds, task]])
    w = np.concatenate([dataset.w[pos_inds, task], dataset.w[neg_inds, task]])
    ids = np.concatenate([dataset.ids[pos_inds], dataset.ids[neg_inds]])
    supports.append(NumpyDataset(X, y, w, ids))
  return supports


class EpisodeGenerator(object):
  """Generates (support, test) pairs for episodic training.

  Precomputes all (support, test) pairs at construction. Allows to reduce
  overhead from computation.
  """

  def __init__(self, dataset, n_pos, n_neg, n_test, n_episodes_per_task):
    """
    Parameters
@@ -268,10 +273,10 @@ class EpisodeGenerator(object):
    self.task_episodes = {}

    for task in range(self.n_tasks):
      task_supports = get_task_support(
          self.dataset, n_episodes_per_task, n_pos, n_neg, task)
      task_tests = get_task_test(
          self.dataset, n_episodes_per_task, n_test, task)
      task_supports = get_task_support(self.dataset, n_episodes_per_task, n_pos,
                                       n_neg, task)
      task_tests = get_task_test(self.dataset, n_episodes_per_task, n_test,
                                 task)
      self.task_episodes[task] = (task_supports, task_tests)

    # Init the iterator
@@ -280,8 +285,8 @@ class EpisodeGenerator(object):
    self.task_num = 0
    self.trial_num = 0
    time_end = time.time()
    print("Constructing EpisodeGenerator took %s seconds"
          % str(time_end-time_start))
    print("Constructing EpisodeGenerator took %s seconds" %
          str(time_end - time_start))

  def __iter__(self):
    return self
@@ -317,6 +322,7 @@ class SupportGenerator(object):
  Iterates over tasks and trials. For each trial, picks one support from
  each task, and returns in a randomized order
  """

  def __init__(self, dataset, n_pos, n_neg, n_trials):
    """
    Parameters
@@ -360,7 +366,10 @@ class SupportGenerator(object):
      task = self.perm_tasks[self.task_num]  # Get id from permutation
      #support = self.supports[task][self.trial_num]
      support = get_single_task_support(
          self.dataset, n_pos=self.n_pos, n_neg=self.n_neg, task=task,
          self.dataset,
          n_pos=self.n_pos,
          n_neg=self.n_neg,
          task=task,
          replace=False)
      # Increment and update logic
      self.task_num += 1
@@ -372,4 +381,3 @@ class SupportGenerator(object):
      return (task, support)

  __next__ = next  # Python 3.X compatibility
+2 −2
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@ import deepchem as dc
import numpy as np
from sklearn.ensemble import RandomForestClassifier


class TestDrop(unittest.TestCase):
  """
  Test how loading of malformed compounds is handled.
@@ -21,8 +22,7 @@ class TestDrop(unittest.TestCase):

    current_dir = os.path.dirname(os.path.realpath(__file__))
    print("About to load emols dataset.")
    dataset_file = os.path.join(
        current_dir, "mini_emols.csv")
    dataset_file = os.path.join(current_dir, "mini_emols.csv")

    # Featurize emols dataset
    print("About to featurize datasets.")
+8 −8
Original line number Diff line number Diff line
@@ -74,13 +74,13 @@ class TestReload(unittest.TestCase):
    dataset_file = os.path.join(current_dir,
                                "../../../datasets/mini_muv.csv.gz")
    print("Running experiment for first time without reload.")
    (len_train, len_valid, len_test) = self._run_muv_experiment(dataset_file,
                                                                reload)
    (len_train, len_valid, len_test) = self._run_muv_experiment(
        dataset_file, reload)

    print("Running experiment for second time with reload.")
    reload = True
    (len_reload_train, len_reload_valid, len_reload_test) = (
        self._run_muv_experiment(dataset_file, reload))
    (len_reload_train, len_reload_valid,
     len_reload_test) = (self._run_muv_experiment(dataset_file, reload))
    assert len_train == len_reload_train
    assert len_valid == len_reload_valid
    assert len_test == len_reload_valid
@@ -92,12 +92,12 @@ class TestReload(unittest.TestCase):
    dataset_file = os.path.join(current_dir,
                                "../../../datasets/mini_muv.csv.gz")
    print("Running experiment for first time with reload.")
    (len_train, len_valid, len_test) = self._run_muv_experiment(dataset_file,
                                                                reload)
    (len_train, len_valid, len_test) = self._run_muv_experiment(
        dataset_file, reload)

    print("Running experiment for second time with reload.")
    (len_reload_train, len_reload_valid, len_reload_test) = (
        self._run_muv_experiment(dataset_file, reload))
    (len_reload_train, len_reload_valid,
     len_reload_test) = (self._run_muv_experiment(dataset_file, reload))
    assert len_train == len_reload_train
    assert len_valid == len_reload_valid
    assert len_test == len_reload_valid
Loading