Commit c1302d05 authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #219 from rbharath/object_featurizers

Featurizers that return arbitrary objects
parents 1aafd057 1e57a4f9
Loading
Loading
Loading
Loading
+115 −75
Original line number Diff line number Diff line
@@ -73,11 +73,17 @@ class Dataset(object):
      # TODO(rbharath): This is a hack. Clean up.
      if not len(df):
        return None
      if hasattr(featurizer, "dtype"):
        dtype = featurizer.dtype
        compute_feature_statistics = False
      else:
        dtype = float
        compute_feature_statistics = True
      ############################################################## TIMING
      time1 = time.time()
      ############################################################## TIMING
      ids, X, y, w = convert_df_to_numpy(df, feature_type, tasks, mol_id_field,
                                         verbosity)
                                         dtype, verbosity)
      ############################################################## TIMING
      time2 = time.time()
      log("TIMING: convert_df_to_numpy took %0.3f s" % (time2-time1), verbosity)
@@ -88,7 +94,9 @@ class Dataset(object):
      assert X.shape[0] == y.shape[0]
      assert y.shape == w.shape
      assert len(ids) == X.shape[0]
    return Dataset.write_data_to_disk(data_dir, basename, tasks, X, y, w, ids)
    return Dataset.write_data_to_disk(
        data_dir, basename, tasks, X, y, w, ids,
        compute_feature_statistics=compute_feature_statistics)

  @staticmethod
  def construct_metadata(metadata_entries):
@@ -107,7 +115,8 @@ class Dataset(object):
    return metadata_df

  @staticmethod
  def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None, ids=None):
  def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None, ids=None,
                         compute_feature_statistics=True):
    out_X = "%s-X.joblib" % basename
    out_X_transformed = "%s-X-transformed.joblib" % basename
    out_X_sums = "%s-X_sums.joblib" % basename
@@ -125,6 +134,11 @@ class Dataset(object):
    if X is not None:
      save_to_disk(X, os.path.join(data_dir, out_X))
      save_to_disk(X, os.path.join(data_dir, out_X_transformed))
      if compute_feature_statistics:
        ########################################################## DEBUG
        print("compute_feature_statistics")
        print(compute_feature_statistics)
        ########################################################## DEBUG
        X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
        save_to_disk(X_sums, os.path.join(data_dir, out_X_sums))
        save_to_disk(X_sum_squares, os.path.join(data_dir, out_X_sum_squares))
@@ -449,7 +463,9 @@ class Dataset(object):
    """Sets verbosity."""
    self.verbosity = new_verbosity

  def select(self, select_dir, indices):
  # TODO(rbharath): This change for general object types seems a little
  # kludgey.  Is there a more principled approach to support general objects?
  def select(self, select_dir, indices, compute_feature_statistics=False):
    """Creates a new dataset from a selection of indices from self."""
    if not os.path.exists(select_dir):
      os.makedirs(select_dir)
@@ -478,8 +494,10 @@ class Dataset(object):
      ids_sel = ids[shard_indices]
      basename = "dataset-%d" % shard_num
      metadata_rows.append(
          Dataset.write_data_to_disk(select_dir, basename, tasks,
                                     X_sel, y_sel, w_sel, ids_sel))
          Dataset.write_data_to_disk(
              select_dir, basename, tasks,
              X_sel, y_sel, w_sel, ids_sel,
              compute_feature_statistics=compute_feature_statistics))
      # Updating counts
      indices_count += num_shard_elts
      count += shard_len
@@ -603,20 +621,33 @@ class Dataset(object):
    """Return pandas series of label stds."""
    return self.metadata_df["y_stds"]

  def get_statistics(self):
  def get_statistics(self, X_stats=True, y_stats=True):
    """Computes and returns statistics of this dataset"""
    if len(self) == 0:
      return None, None, None, None
    self.update_moments()
    self.update_moments(X_stats, y_stats)
    df = self.metadata_df
    X_means, X_stds, y_means, y_stds = self._compute_mean_and_std(df)
    if X_stats and not y_stats:
      X_means, X_stds = self._compute_mean_and_std(df, X_stats, y_stats)
      return X_means, X_stds
    elif y_stats and not X_stats:
      y_means, y_stds = self._compute_mean_and_std(df, X_stats, y_stats)
      return y_means, y_stds
    elif X_stats and y_stats:
      X_means, X_stds = self._compute_mean_and_std(
          df, X_stats=True, y_stats=False)
      y_means, y_stds = self._compute_mean_and_std(
          df, X_stats=False, y_stats=True)
      return X_means, X_stds, y_means, y_stds
    else:
      return None

  def _compute_mean_and_std(self, df):
  def _compute_mean_and_std(self, df, X_stats, y_stats):
    """
    Compute means/stds of X/y from sums/sum_squares of tensors.
    """

    if X_stats:
      X_sums = []
      X_sum_squares = []
      X_n = []
@@ -637,7 +668,9 @@ class Dataset(object):
      overall_X_sum_squares = np.sum(X_sum_squares, axis=0)

      X_vars = (overall_X_sum_squares - np.square(overall_X_sums)/n)/(n)
      return overall_X_means, np.sqrt(X_vars)

    if y_stats:
      y_sums = []
      y_sum_squares = []
      y_n = []
@@ -655,17 +688,18 @@ class Dataset(object):
      y_sum_squares = np.vstack(y_sum_squares)
      y_means = np.sum(y_sums, axis=0)/y_n
      y_vars = np.sum(y_sum_squares, axis=0)/y_n - np.square(y_means)
    return overall_X_means, np.sqrt(X_vars), y_means, np.sqrt(y_vars)
      return y_means, np.sqrt(y_vars)
  
  def update_moments(self):
  def update_moments(self, X_stats, y_stats):
    """Re-compute statistics of this dataset during transformation"""
    df = self.metadata_df
    self._update_mean_and_std(df)
    self._update_mean_and_std(df, X_stats, y_stats)

  def _update_mean_and_std(self, df):
  def _update_mean_and_std(self, df, X_stats, y_stats):
    """
    Recompute sums/sum_squares from the transformed X/y tensors and save them to disk.
    """
    if X_stats:
      X_transform = []
      for _, row in df.iterrows():
        Xt = load_from_disk(os.path.join(self.data_dir, row['X-transformed']))
@@ -674,6 +708,7 @@ class Dataset(object):
        save_to_disk(Xs, os.path.join(self.data_dir, row['X_sums']))
        save_to_disk(Xss, os.path.join(self.data_dir, row['X_sum_squares']))

    if y_stats:
      y_transform = []
      for _, row in df.iterrows():
        yt = load_from_disk(os.path.join(self.data_dir, row['y-transformed']))
@@ -717,7 +752,6 @@ class Dataset(object):

    return grad, ydely_means


def compute_sums_and_nb_sample(tensor, W=None):
  """
  Computes sums, squared sums of tensor along axis 0.
@@ -749,7 +783,8 @@ def compute_sums_and_nb_sample(tensor, W=None):

# The following are all associated with Dataset, but are separate functions to
# make it easy to use multiprocessing.
def convert_df_to_numpy(df, feature_type, tasks, mol_id_field, verbosity=None):
def convert_df_to_numpy(df, feature_type, tasks, mol_id_field, dtype,
                        verbosity=None):
  """Transforms a dataframe containing deepchem input into numpy arrays"""
  if feature_type not in df.keys():
    raise ValueError(
@@ -808,4 +843,9 @@ def convert_df_to_numpy(df, feature_type, tasks, mol_id_field, verbosity=None):
  w = w[valid_inds]
  # Adding this assertion in to avoid ill-formed outputs.
  assert len(sorted_ids) == len(x) == len(y) == len(w)
  if dtype == float:
    return sorted_ids, x.astype(float), y.astype(float), w.astype(float)
  elif dtype == object:
    return sorted_ids, x, y.astype(float), w.astype(float)
  else:
    raise ValueError("Unrecognized dtype for featurizer.")
+7 −2
Original line number Diff line number Diff line
@@ -173,6 +173,9 @@ class NeighborListAtomicCoordinates(Featurizer):
    if neighbor_cutoff <= 0:
      raise ValueError("neighbor_cutoff must be positive value.")
    self.neighbor_cutoff = neighbor_cutoff
    # Type of data created by this featurizer
    self.dtype = object
    self.coordinates_featurizer = AtomicCoordinates()

  def _featurize(self, mol):
    """
@@ -182,6 +185,8 @@ class NeighborListAtomicCoordinates(Featurizer):
    ----------
    """
    N = mol.GetNumAtoms()
    # TODO(rbharath): Should this return a list?
    bohr_coords = self.coordinates_featurizer._featurize(mol)[0]
    coords = get_coords(mol)

    x_bins, y_bins, z_bins = get_cells(coords, self.neighbor_cutoff)
@@ -217,6 +222,6 @@ class NeighborListAtomicCoordinates(Featurizer):
          if np.linalg.norm(coords[atom] - coords[neighbor_atom]) < self.neighbor_cutoff:
            neighbor_list[atom].add(neighbor_atom)
          
      neighbor_list[atom] = list(neighbor_list[atom])
      neighbor_list[atom] = sorted(list(neighbor_list[atom]))
        
    return neighbor_list
    return (bohr_coords, neighbor_list)
+3 −4
Original line number Diff line number Diff line
@@ -159,7 +159,7 @@ class TestAtomicCoordinates(unittest.TestCase):
    x_bins, y_bins, z_bins = get_cells(coords, nblist_featurizer.neighbor_cutoff)

    nblist_featurizer = NeighborListAtomicCoordinates()
    nblist = nblist_featurizer._featurize(self.mol)
    nblist = nblist_featurizer._featurize(self.mol)[1]
    assert isinstance(nblist, dict)
    assert len(nblist.keys()) == N
    for (atom, neighbors) in nblist.items():
@@ -185,13 +185,12 @@ class TestAtomicCoordinates(unittest.TestCase):

    # Test with cutoff 0.1 angstroms. There should be no neighbors in this case.
    nblist_featurizer = NeighborListAtomicCoordinates(neighbor_cutoff=.1)
    nblist = nblist_featurizer._featurize(self.mol)
    nblist = nblist_featurizer._featurize(self.mol)[1]
    for atom in range(N):
      assert len(nblist[atom]) == 0

    # Test with cutoff 100 angstroms. Everything should be neighbors now.
    nblist_featurizer = NeighborListAtomicCoordinates(neighbor_cutoff=100)
    nblist = nblist_featurizer._featurize(self.mol)
    nblist = nblist_featurizer._featurize(self.mol)[1]
    for atom in range(N):
      assert len(nblist[atom]) == N-1
    
+7 −5
Original line number Diff line number Diff line
"""Helper operations and classes for general model building.

"""
from __future__ import print_function
from __future__ import division
@@ -209,7 +208,10 @@ class TensorflowGraph(object):
          if shuffle:
            log("About to shuffle dataset before epoch start.", self.verbosity)
            dataset.shuffle()
          for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(batch_size):
          ####################################################################### DEBUG
          for ind, (X_b, y_b, w_b, ids_b) in enumerate(dataset.iterbatches(batch_size)):
            log("On batch %d" % ind, self.verbosity)
          ####################################################################### DEBUG
            # Run training op.
            feed_dict = self.construct_feed_dict(X_b, y_b, w_b, ids_b)
            fetches = self.output + [
@@ -370,9 +372,9 @@ class TensorflowGraph(object):
      # allow_soft_placement=True allows ops without a GPU implementation
      # to run on the CPU instead.
      config = tf.ConfigProto(allow_soft_placement=True)
      ################################################################# DEBUG
      config.gpu_options.allow_growth = True
      ################################################################# DEBUG
      ################################################################## DEBUG
      #config.gpu_options.allow_growth = True
      ################################################################## DEBUG
      self._shared_session = tf.Session(config=config)
    return self._shared_session

+2 −0
Original line number Diff line number Diff line
@@ -193,6 +193,7 @@ class TestModelAPI(TestAPI):
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
@@ -218,6 +219,7 @@ class TestModelAPI(TestAPI):
                         mode="regression",
                         model_instance=RandomForestRegressor())
  

    # Fit trained model
    model.fit(train_dataset)
    model.save()
Loading