Commit 1e57a4f9 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Some test fixes and cleanup

parent 08fdf94c
Loading
Loading
Loading
Loading
+16 −52
Original line number Diff line number Diff line
@@ -73,14 +73,12 @@ class Dataset(object):
      # TODO(rbharath): This is a hack. clean up.
      if not len(df):
        return None
      ##################################################### DEBUG
      if hasattr(featurizer, "dtype"):
        dtype = featurizer.dtype
        compute_feature_statistics = False
      else:
        dtype = float
        compute_feature_statistics = True
      ##################################################### DEBUG
      ############################################################## TIMING
      time1 = time.time()
      ############################################################## TIMING
@@ -117,11 +115,8 @@ class Dataset(object):
    return metadata_df

  @staticmethod
  ############################################################## DEBUG
  #def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None, ids=None):
  def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None, ids=None,
                         compute_feature_statistics=True):
  ############################################################## DEBUG
    out_X = "%s-X.joblib" % basename
    out_X_transformed = "%s-X-transformed.joblib" % basename
    out_X_sums = "%s-X_sums.joblib" % basename
@@ -140,6 +135,10 @@ class Dataset(object):
      save_to_disk(X, os.path.join(data_dir, out_X))
      save_to_disk(X, os.path.join(data_dir, out_X_transformed))
      if compute_feature_statistics:
        ########################################################## DEBUG
        print("compute_feature_statistics")
        print(compute_feature_statistics)
        ########################################################## DEBUG
        X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
        save_to_disk(X_sums, os.path.join(data_dir, out_X_sums))
        save_to_disk(X_sum_squares, os.path.join(data_dir, out_X_sum_squares))
@@ -464,12 +463,9 @@ class Dataset(object):
    """Sets verbosity."""
    self.verbosity = new_verbosity

  ####################################################### DEBUG
  # TODO(rbharath): This ad-hoc change for general object types seems kludgey.
  # Is there a more principled approach to support general objects?
  #def select(self, select_dir, indices):
  # TODO(rbharath): This change for general object types seems a little
  # kludgey.  Is there a more principled approach to support general objects?
  def select(self, select_dir, indices, compute_feature_statistics=False):
  ####################################################### DEBUG
    """Creates a new dataset from a selection of indices from self."""
    if not os.path.exists(select_dir):
      os.makedirs(select_dir)
@@ -497,13 +493,11 @@ class Dataset(object):
      w_sel = w[shard_indices]
      ids_sel = ids[shard_indices]
      basename = "dataset-%d" % shard_num
      ############################################################ DEBUG
      metadata_rows.append(
          Dataset.write_data_to_disk(
              select_dir, basename, tasks,
              X_sel, y_sel, w_sel, ids_sel,
              compute_feature_statistics=compute_feature_statistics))
      ############################################################ DEBUG
      # Updating counts
      indices_count += num_shard_elts
      count += shard_len
@@ -627,38 +621,32 @@ class Dataset(object):
    """Return pandas series of label stds."""
    return self.metadata_df["y_stds"]

  def get_statistics(self, X_stats=True, y_stats=True):
    """Compute and return mean/std statistics of this dataset.

    Updates the on-disk moment shards first via update_moments, then
    aggregates them from the metadata dataframe.

    Parameters
    ----------
    X_stats: bool
      If True, compute and return feature (X) means and stds.
    y_stats: bool
      If True, compute and return label (y) means and stds.

    Returns
    -------
    (X_means, X_stds) if only X_stats; (y_means, y_stds) if only y_stats;
    (X_means, X_stds, y_means, y_stds) if both; None if neither.
    Returns (None, None, None, None) for an empty dataset.
    """
    # Empty dataset: nothing to aggregate.
    if len(self) == 0:
      return None, None, None, None
    self.update_moments(X_stats, y_stats)
    df = self.metadata_df
    if X_stats and not y_stats:
      X_means, X_stds = self._compute_mean_and_std(df, X_stats, y_stats)
      return X_means, X_stds
    elif y_stats and not X_stats:
      y_means, y_stds = self._compute_mean_and_std(df, X_stats, y_stats)
      return y_means, y_stds
    elif X_stats and y_stats:
      # _compute_mean_and_std returns a single (means, stds) pair per call,
      # so query X and y moments separately.
      X_means, X_stds = self._compute_mean_and_std(
          df, X_stats=True, y_stats=False)
      y_means, y_stds = self._compute_mean_and_std(
          df, X_stats=False, y_stats=True)
      return X_means, X_stds, y_means, y_stds
    else:
      return None

  ################################################## DEBUG
  #def _compute_mean_and_std(self, df):
  def _compute_mean_and_std(self, df, X_stats, y_stats):
  ################################################## DEBUG
    """
    Compute means/stds of X/y from sums/sum_squares of tensors.
    """

  ################################################## DEBUG
    if X_stats:
      X_sums = []
      X_sum_squares = []
@@ -681,9 +669,7 @@ class Dataset(object):

      X_vars = (overall_X_sum_squares - np.square(overall_X_sums)/n)/(n)
      return overall_X_means, np.sqrt(X_vars)
  ################################################## DEBUG

  ################################################## DEBUG
    if y_stats:
      y_sums = []
      y_sum_squares = []
@@ -703,27 +689,16 @@ class Dataset(object):
      y_means = np.sum(y_sums, axis=0)/y_n
      y_vars = np.sum(y_sum_squares, axis=0)/y_n - np.square(y_means)
      return y_means, np.sqrt(y_vars)
  ################################################## DEBUG
  
  def update_moments(self, X_stats, y_stats):
    """Re-compute sum/sum-of-squares shards for this dataset.

    Delegates to _update_mean_and_std, which rewrites the per-shard
    X/y sums and sum-of-squares files on disk (used after a transform
    invalidates the cached statistics).

    Parameters
    ----------
    X_stats: bool
      If True, recompute feature (X) moment shards.
    y_stats: bool
      If True, recompute label (y) moment shards.
    """
    df = self.metadata_df
    self._update_mean_and_std(df, X_stats, y_stats)

  ########################################################## DEBUG
  #def _update_mean_and_std(self, df):
  def _update_mean_and_std(self, df, X_stats, y_stats):
  ########################################################## DEBUG
    """
    Compute means/stds of X/y from sums/sum_squares of tensors.
    """
    ########################################################## DEBUG
    if X_stats:
      X_transform = []
      for _, row in df.iterrows():
@@ -732,9 +707,7 @@ class Dataset(object):
        Xss = np.sum(np.square(Xt),axis=0)
        save_to_disk(Xs, os.path.join(self.data_dir, row['X_sums']))
        save_to_disk(Xss, os.path.join(self.data_dir, row['X_sum_squares']))
    ########################################################## DEBUG

    ########################################################## DEBUG
    if y_stats:
      y_transform = []
      for _, row in df.iterrows():
@@ -743,7 +716,6 @@ class Dataset(object):
        yss = np.sum(np.square(yt),axis=0)
        save_to_disk(ys, os.path.join(self.data_dir, row['y_sums']))
        save_to_disk(yss, os.path.join(self.data_dir, row['y_sum_squares']))
    ########################################################## DEBUG

  def get_grad_statistics(self):
    """Computes and returns statistics of this dataset
@@ -780,7 +752,6 @@ class Dataset(object):

    return grad, ydely_means


def compute_sums_and_nb_sample(tensor, W=None):
  """
  Computes sums, squared sums of tensor along axis 0.
@@ -872,16 +843,9 @@ def convert_df_to_numpy(df, feature_type, tasks, mol_id_field, dtype,
  w = w[valid_inds]
  # Adding this assertion in to avoid ill-formed outputs.
  assert len(sorted_ids) == len(x) == len(y) == len(w)
  ############################################################## DEBUG
  #return sorted_ids, x.astype(float), y.astype(float), w.astype(float)
  print("x[0]")
  print(x[0])
  print("type(x)")
  print(type(x))
  if dtype == float:
    return sorted_ids, x.astype(float), y.astype(float), w.astype(float)
  elif dtype == object:
    return sorted_ids, x, y.astype(float), w.astype(float)
  else:
    raise ValueError("Unrecognized dtype for featurizer.")
  ############################################################## DEBUG
+3 −3
Original line number Diff line number Diff line
@@ -159,7 +159,7 @@ class TestAtomicCoordinates(unittest.TestCase):
    x_bins, y_bins, z_bins = get_cells(coords, nblist_featurizer.neighbor_cutoff)

    nblist_featurizer = NeighborListAtomicCoordinates()
    nblist = nblist_featurizer._featurize(self.mol)
    nblist = nblist_featurizer._featurize(self.mol)[1]
    assert isinstance(nblist, dict)
    assert len(nblist.keys()) == N
    for (atom, neighbors) in nblist.items():
@@ -185,12 +185,12 @@ class TestAtomicCoordinates(unittest.TestCase):

    # Test with cutoff 0 angstroms. There should be no neighbors in this case.
    nblist_featurizer = NeighborListAtomicCoordinates(neighbor_cutoff=.1)
    nblist = nblist_featurizer._featurize(self.mol)
    nblist = nblist_featurizer._featurize(self.mol)[1]
    for atom in range(N):
      assert len(nblist[atom]) == 0

    # Test with cutoff 100 angstroms. Everything should be neighbors now.
    nblist_featurizer = NeighborListAtomicCoordinates(neighbor_cutoff=100)
    nblist = nblist_featurizer._featurize(self.mol)
    nblist = nblist_featurizer._featurize(self.mol)[1]
    for atom in range(N):
      assert len(nblist[atom]) == N-1
+1 −2
Original line number Diff line number Diff line
"""Helper operations and classes for general model building.

"""
from __future__ import print_function
from __future__ import division
@@ -211,7 +210,7 @@ class TensorflowGraph(object):
            dataset.shuffle()
          ####################################################################### DEBUG
          for ind, (X_b, y_b, w_b, ids_b) in enumerate(dataset.iterbatches(batch_size)):
            print("On batch %d" % ind)
            log("On batch %d" % ind, self.verbosity)
          ####################################################################### DEBUG
            # Run training op.
            feed_dict = self.construct_feed_dict(X_b, y_b, w_b, ids_b)
+2 −0
Original line number Diff line number Diff line
@@ -193,6 +193,7 @@ class TestModelAPI(TestAPI):
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
@@ -218,6 +219,7 @@ class TestModelAPI(TestAPI):
                         mode="regression",
                         model_instance=RandomForestRegressor())
  

    # Fit trained model
    model.fit(train_dataset)
    model.save()
+11 −4
Original line number Diff line number Diff line
@@ -37,7 +37,8 @@ class Splitter(object):
  def train_valid_test_split(self, dataset, train_dir,
                             valid_dir, test_dir, frac_train=.8,
                             frac_valid=.1, frac_test=.1, seed=None,
                             log_every_n=1000):
                             log_every_n=1000,
                             compute_feature_statistics=True):
    """
    Splits self into train/validation/test sets.

@@ -48,12 +49,18 @@ class Splitter(object):
      dataset,
      frac_train=frac_train, frac_test=frac_test,
      frac_valid=frac_valid, log_every_n=log_every_n)
    train_dataset = dataset.select(train_dir, train_inds)
    train_dataset = dataset.select( 
        train_dir, train_inds,
        compute_feature_statistics=compute_feature_statistics)
    if valid_dir is not None:
      valid_dataset = dataset.select(valid_dir, valid_inds)
      valid_dataset = dataset.select(
          valid_dir, valid_inds,
          compute_feature_statistics=compute_feature_statistics)
    else:
      valid_dataset = None
    test_dataset = dataset.select(test_dir, test_inds)
    test_dataset = dataset.select(
        test_dir, test_inds,
        compute_feature_statistics=compute_feature_statistics)

    return train_dataset, valid_dataset, test_dataset

Loading