Commit 08fdf94c authored by Bharath Ramsundar
Browse files

Preliminary commit for arbitrary object featurizers

parent 494cce1f
Loading
Loading
Loading
Loading
+152 −76
Original line number Diff line number Diff line
@@ -73,11 +73,19 @@ class Dataset(object):
      # TODO(rbharath): This is a hack. clean up.
      if not len(df):
        return None
      ##################################################### DEBUG
      if hasattr(featurizer, "dtype"):
        dtype = featurizer.dtype
        compute_feature_statistics = False
      else:
        dtype = float
        compute_feature_statistics = True
      ##################################################### DEBUG
      ############################################################## TIMING
      time1 = time.time()
      ############################################################## TIMING
      ids, X, y, w = convert_df_to_numpy(df, feature_type, tasks, mol_id_field,
                                         verbosity)
                                         dtype, verbosity)
      ############################################################## TIMING
      time2 = time.time()
      log("TIMING: convert_df_to_numpy took %0.3f s" % (time2-time1), verbosity)
@@ -88,7 +96,9 @@ class Dataset(object):
      assert X.shape[0] == y.shape[0]
      assert y.shape == w.shape
      assert len(ids) == X.shape[0]
    return Dataset.write_data_to_disk(data_dir, basename, tasks, X, y, w, ids)
    return Dataset.write_data_to_disk(
        data_dir, basename, tasks, X, y, w, ids,
        compute_feature_statistics=compute_feature_statistics)

  @staticmethod
  def construct_metadata(metadata_entries):
@@ -107,7 +117,11 @@ class Dataset(object):
    return metadata_df

  @staticmethod
  def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None, ids=None):
  ############################################################## DEBUG
  #def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None, ids=None):
  def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None, ids=None,
                         compute_feature_statistics=True):
  ############################################################## DEBUG
    out_X = "%s-X.joblib" % basename
    out_X_transformed = "%s-X-transformed.joblib" % basename
    out_X_sums = "%s-X_sums.joblib" % basename
@@ -125,6 +139,7 @@ class Dataset(object):
    if X is not None:
      save_to_disk(X, os.path.join(data_dir, out_X))
      save_to_disk(X, os.path.join(data_dir, out_X_transformed))
      if compute_feature_statistics:
        X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
        save_to_disk(X_sums, os.path.join(data_dir, out_X_sums))
        save_to_disk(X_sum_squares, os.path.join(data_dir, out_X_sum_squares))
@@ -449,7 +464,12 @@ class Dataset(object):
    """Sets verbosity."""
    self.verbosity = new_verbosity

  def select(self, select_dir, indices):
  ####################################################### DEBUG
  # TODO(rbharath): This ad-hoc change for general object types seems kludgey.
  # Is there a more principled approach to support general objects?
  #def select(self, select_dir, indices):
  def select(self, select_dir, indices, compute_feature_statistics=False):
  ####################################################### DEBUG
    """Creates a new dataset from a selection of indices from self."""
    if not os.path.exists(select_dir):
      os.makedirs(select_dir)
@@ -477,9 +497,13 @@ class Dataset(object):
      w_sel = w[shard_indices]
      ids_sel = ids[shard_indices]
      basename = "dataset-%d" % shard_num
      ############################################################ DEBUG
      metadata_rows.append(
          Dataset.write_data_to_disk(select_dir, basename, tasks,
                                     X_sel, y_sel, w_sel, ids_sel))
          Dataset.write_data_to_disk(
              select_dir, basename, tasks,
              X_sel, y_sel, w_sel, ids_sel,
              compute_feature_statistics=compute_feature_statistics))
      ############################################################ DEBUG
      # Updating counts
      indices_count += num_shard_elts
      count += shard_len
@@ -603,20 +627,39 @@ class Dataset(object):
    """Return pandas series of label stds."""
    return self.metadata_df["y_stds"]

  def get_statistics(self):
  ################################################## DEBUG
  #def get_statistics(self):
  def get_statistics(self, X_stats=True, y_stats=True):
  ################################################## DEBUG
    """Computes and returns statistics of this dataset"""
    if len(self) == 0:
      return None, None, None, None
    self.update_moments()
    ################################################## DEBUG
    #self.update_moments()
    self.update_moments(X_stats, y_stats)
    ################################################## DEBUG
    df = self.metadata_df
    X_means, X_stds, y_means, y_stds = self._compute_mean_and_std(df)
    return X_means, X_stds, y_means, y_stds

  def _compute_mean_and_std(self, df):
    ################################################## DEBUG
    #X_means, X_stds, y_means, y_stds = self._compute_mean_and_std(df)
    if X_stats:
      X_means, X_stds = self._compute_mean_and_std(df, X_stats, y_stats)
      return X_means, X_stds
    elif y_stats:
      y_means, y_stds = self._compute_mean_and_std(df, X_stats, y_stats)
      return y_means, y_stds
    ################################################## DEBUG
    #return X_means, X_stds, y_means, y_stds

  ################################################## DEBUG
  #def _compute_mean_and_std(self, df):
  def _compute_mean_and_std(self, df, X_stats, y_stats):
  ################################################## DEBUG
    """
    Compute means/stds of X/y from sums/sum_squares of tensors.
    """

  ################################################## DEBUG
    if X_stats:
      X_sums = []
      X_sum_squares = []
      X_n = []
@@ -637,7 +680,11 @@ class Dataset(object):
      overall_X_sum_squares = np.sum(X_sum_squares, axis=0)

      X_vars = (overall_X_sum_squares - np.square(overall_X_sums)/n)/(n)
      return overall_X_means, np.sqrt(X_vars)
  ################################################## DEBUG

  ################################################## DEBUG
    if y_stats:
      y_sums = []
      y_sum_squares = []
      y_n = []
@@ -655,17 +702,29 @@ class Dataset(object):
      y_sum_squares = np.vstack(y_sum_squares)
      y_means = np.sum(y_sums, axis=0)/y_n
      y_vars = np.sum(y_sum_squares, axis=0)/y_n - np.square(y_means)
    return overall_X_means, np.sqrt(X_vars), y_means, np.sqrt(y_vars)
      return y_means, np.sqrt(y_vars)
  ################################################## DEBUG
  
  def update_moments(self):
  ########################################################## DEBUG
  #def update_moments(self):
  def update_moments(self, X_stats, y_stats):
  ########################################################## DEBUG
    """Re-compute statistics of this dataset during transformation"""
    df = self.metadata_df
    self._update_mean_and_std(df)
    ########################################################## DEBUG
    #self._update_mean_and_std(df)
    self._update_mean_and_std(df, X_stats, y_stats)
    ########################################################## DEBUG

  def _update_mean_and_std(self, df):
  ########################################################## DEBUG
  #def _update_mean_and_std(self, df):
  def _update_mean_and_std(self, df, X_stats, y_stats):
  ########################################################## DEBUG
    """
    Recompute sums/sum_squares of the transformed X/y for each metadata row
    and save them to disk.
    """
    ########################################################## DEBUG
    if X_stats:
      X_transform = []
      for _, row in df.iterrows():
        Xt = load_from_disk(os.path.join(self.data_dir, row['X-transformed']))
@@ -673,7 +732,10 @@ class Dataset(object):
        Xss = np.sum(np.square(Xt),axis=0)
        save_to_disk(Xs, os.path.join(self.data_dir, row['X_sums']))
        save_to_disk(Xss, os.path.join(self.data_dir, row['X_sum_squares']))
    ########################################################## DEBUG

    ########################################################## DEBUG
    if y_stats:
      y_transform = []
      for _, row in df.iterrows():
        yt = load_from_disk(os.path.join(self.data_dir, row['y-transformed']))
@@ -681,6 +743,7 @@ class Dataset(object):
        yss = np.sum(np.square(yt),axis=0)
        save_to_disk(ys, os.path.join(self.data_dir, row['y_sums']))
        save_to_disk(yss, os.path.join(self.data_dir, row['y_sum_squares']))
    ########################################################## DEBUG

  def get_grad_statistics(self):
    """Computes and returns statistics of this dataset
@@ -749,7 +812,8 @@ def compute_sums_and_nb_sample(tensor, W=None):

# The following are all associated with Dataset, but are separate functions to
# make it easy to use multiprocessing.
def convert_df_to_numpy(df, feature_type, tasks, mol_id_field, verbosity=None):
def convert_df_to_numpy(df, feature_type, tasks, mol_id_field, dtype,
                        verbosity=None):
  """Transforms a dataframe containing deepchem input into numpy arrays"""
  if feature_type not in df.keys():
    raise ValueError(
@@ -808,4 +872,16 @@ def convert_df_to_numpy(df, feature_type, tasks, mol_id_field, verbosity=None):
  w = w[valid_inds]
  # Adding this assertion in to avoid ill-formed outputs.
  assert len(sorted_ids) == len(x) == len(y) == len(w)
  ############################################################## DEBUG
  #return sorted_ids, x.astype(float), y.astype(float), w.astype(float)
  print("x[0]")
  print(x[0])
  print("type(x)")
  print(type(x))
  if dtype == float:
    return sorted_ids, x.astype(float), y.astype(float), w.astype(float)
  elif dtype == object:
    return sorted_ids, x, y.astype(float), w.astype(float)
  else:
    raise ValueError("Unrecognized dtype for featurizer.")
  ############################################################## DEBUG
+7 −2
Original line number Diff line number Diff line
@@ -173,6 +173,9 @@ class NeighborListAtomicCoordinates(Featurizer):
    if neighbor_cutoff <= 0:
      raise ValueError("neighbor_cutoff must be positive value.")
    self.neighbor_cutoff = neighbor_cutoff
    # Type of data created by this featurizer
    self.dtype = object
    self.coordinates_featurizer = AtomicCoordinates()

  def _featurize(self, mol):
    """
@@ -182,6 +185,8 @@ class NeighborListAtomicCoordinates(Featurizer):
    ----------
    """
    N = mol.GetNumAtoms()
    # TODO(rbharath): Should this return a list?
    bohr_coords = self.coordinates_featurizer._featurize(mol)[0]
    coords = get_coords(mol)

    x_bins, y_bins, z_bins = get_cells(coords, self.neighbor_cutoff)
@@ -217,6 +222,6 @@ class NeighborListAtomicCoordinates(Featurizer):
          if np.linalg.norm(coords[atom] - coords[neighbor_atom]) < self.neighbor_cutoff:
            neighbor_list[atom].add(neighbor_atom)
          
      neighbor_list[atom] = list(neighbor_list[atom])
      neighbor_list[atom] = sorted(list(neighbor_list[atom]))
        
    return neighbor_list
    return (bohr_coords, neighbor_list)
+0 −1
Original line number Diff line number Diff line
@@ -194,4 +194,3 @@ class TestAtomicCoordinates(unittest.TestCase):
    nblist = nblist_featurizer._featurize(self.mol)
    for atom in range(N):
      assert len(nblist[atom]) == N-1
    
+7 −4
Original line number Diff line number Diff line
@@ -209,7 +209,10 @@ class TensorflowGraph(object):
          if shuffle:
            log("About to shuffle dataset before epoch start.", self.verbosity)
            dataset.shuffle()
          for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(batch_size):
          ####################################################################### DEBUG
          for ind, (X_b, y_b, w_b, ids_b) in enumerate(dataset.iterbatches(batch_size)):
            print("On batch %d" % ind)
          ####################################################################### DEBUG
            # Run training op.
            feed_dict = self.construct_feed_dict(X_b, y_b, w_b, ids_b)
            fetches = self.output + [
@@ -370,9 +373,9 @@ class TensorflowGraph(object):
      # allow_soft_placement=True allows ops without a GPU implementation
      # to run on the CPU instead.
      config = tf.ConfigProto(allow_soft_placement=True)
      ################################################################# DEBUG
      config.gpu_options.allow_growth = True
      ################################################################# DEBUG
      ################################################################## DEBUG
      #config.gpu_options.allow_growth = True
      ################################################################## DEBUG
      self._shared_session = tf.Session(config=config)
    return self._shared_session

+13 −7
Original line number Diff line number Diff line
@@ -105,13 +105,19 @@ class NormalizationTransformer(Transformer):
    super(NormalizationTransformer, self).__init__(
        transform_X=transform_X, transform_y=transform_y,
        transform_w=transform_w, dataset=dataset)
    X_means, X_stds, y_means, y_stds = dataset.get_statistics()
    #################################################################### DEBUG
    #X_means, X_stds, y_means, y_stds = dataset.get_statistics()
    if transform_X:
      X_means, X_stds = dataset.get_statistics(X_stats=True, y_stats=False)
      self.X_means = X_means 
      self.X_stds = X_stds
    elif transform_y:
      y_means, y_stds = dataset.get_statistics(X_stats=False, y_stats=True)
      self.y_means = y_means 
      # Control for pathological case with no variance.
      y_stds[y_stds == 0] = 1.
      self.y_stds = y_stds
    #################################################################### DEBUG
    self.transform_gradients = transform_gradients
    if self.transform_gradients:
      true_grad, ydely_means = dataset.get_grad_statistics()