Commit d32d9990 authored by evanfeinberg
Browse files

miscellaneous OOP fixes

parent ecda59f1
Loading
Loading
Loading
Loading
+11 −15
Original line number Diff line number Diff line
@@ -122,25 +122,21 @@ class Model(object):
    """
    Fits a model on data in a Dataset object.
    """
    # TODO(rbharath/enf): This GPU_RAM is black magic. Needs to be removed/made
    # more general.
    MAX_GPU_RAM = float(691007488/50)
    # TODO(rbharath/enf): We need a structured way to deal with potential GPU
    #                     memory overflows.
    batch_size = self.model_params["batch_size"]
    for epoch in range(self.model_params["nb_epoch"]):
      print("Starting epoch %s" % str(epoch+1))
      for i, (X, y, w, _) in enumerate(dataset.itershards()):
        print("Training on batch-%s/epoch-%s" % (str(i+1), str(epoch+1)))
        if sys.getsizeof(X) > MAX_GPU_RAM:
          nb_block = float(sys.getsizeof(X))/MAX_GPU_RAM
        nb_sample = np.shape(X)[0]
          interval_points = np.linspace(nb_sample,nb_block+1).astype(int)
        interval_points = np.linspace(0,nb_sample, np.ceil(float(nb_sample)/batch_size)+1).astype(int)
        for j in range(len(interval_points)-1):
          indices = range(interval_points[j],interval_points[j+1])
          X_batch = X[indices,:]
          y_batch = y[indices]
          w_batch = w[indices]
          self.fit_on_batch(X_batch, y_batch, w_batch)
        else:
          self.fit_on_batch(X, y, w)

  # TODO(rbharath): What does this function do when y is not provided. Suspect
  # it breaks. Need to fix.
+2 −2
Original line number Diff line number Diff line
@@ -29,8 +29,8 @@ class DockingDNN(KerasModel):
  """
  Wrapper class for fitting 3D convolutional networks for deep docking.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    super(DockingDNN, self).__init__(task_types, model_params, initialize_raw_model)
  def __init__(self, model_type, task_types, model_params, initialize_raw_model=True):
    super(DockingDNN, self).__init__(model_type, task_types, model_params, initialize_raw_model)
    if initialize_raw_model:
      (axis_length, _, _, n_channels) = model_params["data_shape"]
      self.input_shape = (n_channels, 
+4 −0
Original line number Diff line number Diff line
@@ -59,7 +59,11 @@ class SklearnModel(Model):
      Xs.append(X)
      ys.append(y)
    X = np.concatenate(Xs)
    print("np.shape(X)")
    print(np.shape(X))
    y = np.concatenate(ys)
    print("np.shape(y)")
    print(np.shape(y))
    self.raw_model.fit(X, y)

  def predict_on_batch(self, X):
+4 −3
Original line number Diff line number Diff line
@@ -289,13 +289,14 @@ def df_to_numpy(df, feature_types):
    tensors.append(features)
  x = np.stack(tensors)

  # Remove entries with missing labels
  nonzero_labels = np.squeeze(np.where(np.squeeze(y)!=''))
  #TODO(enf/rbharath): This is not compatible with multitask use case.
  nonzero_labels = np.arange(len(y))[[val != '' for val in y]]
  #nonzero_labels = np.squeeze(np.where(y!=''))
  x = x[nonzero_labels]
  y = y[nonzero_labels]
  w = w[nonzero_labels]
  nonzero_rows = []
  for nonzero_ind in np.squeeze(nonzero_labels):
  for nonzero_ind in nonzero_labels:
    nonzero_rows.append(df.iloc[nonzero_ind])
  nonzero_df = pd.DataFrame(nonzero_rows)
  sorted_ids = nonzero_df["mol_id"]
+18 −19
Original line number Diff line number Diff line
@@ -31,7 +31,7 @@ def _process_field(val):
  if isinstance(val, float) or isinstance(val, np.ndarray):
    return val
  elif isinstance(val, list):
    return [process_field(elt) for elt in val]
    return [_process_field(elt) for elt in val]
  elif isinstance(val, str):
    try:
      return float(val)
@@ -76,12 +76,7 @@ class DataFeaturizer(object):
      rows.append(self._process_raw_sample(input_type, row, fields))
    df = self._standardize_df(pd.DataFrame(rows))
    for feature_type in feature_types:
      self._featurize_df(df, feature_type)
    print("featurize()")
    print("len(df)")
    print(len(df))
    print("out")
    print(out)
      self._featurize_df(df, rows, feature_type)
    save_to_disk(df, out)
    df_loaded = load_from_disk(out)

@@ -109,7 +104,7 @@ class DataFeaturizer(object):
      filename, file_extension = os.path.splitext(filename)
    if file_extension == ".csv":
      return "csv"
    elif file_extension == ".pkl":
    elif file_extension in [".pkl", ".joblib"]:
      return "pandas"
    elif file_extension == ".sdf":
      return "sdf"
@@ -129,7 +124,7 @@ class DataFeaturizer(object):
            yield row
    elif input_type == "pandas":
      dataframe = load_from_disk(input_file)
      for row in dataframe.iterrows():
      for _, row in dataframe.iterrows():
        yield row
    elif input_type == "sdf":
      if ".gz" in input_file:
@@ -154,8 +149,6 @@ class DataFeaturizer(object):
        data[field] = _process_field(row[ind])
      return data
    elif input_type == "pandas":
      # pandas rows are tuples (row_num, data)
      row = row[1]
      for field in fields:
        data[field] = _process_field(row[field])
    elif input_type == "sdf":
@@ -187,22 +180,22 @@ class DataFeaturizer(object):

    return df

  def _featurize_df(self, df, feature_type):
  def _featurize_df(self, df, rows, feature_type):
    """Generates circular fingerprints for dataset."""
    if feature_type == "user-specified-features":
      if self.user_specified_features is not None:
        if self.verbose:
          print("Adding user-defined features.")
        features_data = []
        for row in df.iterrows():
        for row in rows:
          # pandas rows are tuples (row_num, row_data)
          row, feature_list = row[1], []
          for feature in user_specified_features:
            feature_list.append(row[feature])
          features_data.append({"row": np.array(feature_list)})
          feature_list = []
          for feature_name in self.user_specified_features:
            feature_list.append(row[feature_name])
          features_data.append({feature_type: np.array(feature_list)})
        df[feature_type] = pd.DataFrame(features_data)
        return
    elif feature_type in ["ECFP", "RDKIT-descriptors"]:
    elif feature_type in ["ECFP", "RDKIT-descriptors", "NNScore"]:
      if feature_type == "ECFP":
        if self.verbose:
          print("Generating ECFP circular fingerprints.")
@@ -211,6 +204,8 @@ class DataFeaturizer(object):
        if self.verbose:
          print("Generating RDKIT descriptors.")
        featurizer = SimpleDescriptors()
      elif feature_type == "NNScore":
        pass
      features = []
      sample_smiles = df["smiles"].tolist()
      for ind, smiles in enumerate(sample_smiles):
@@ -301,8 +296,12 @@ class FeaturizedSamples(object):
      df = load_from_disk(dataset_file)
      compound_ids = list(df["mol_id"])
      smiles = list(df["smiles"])
      if "split" in df.keys():
        splits = list(df["split"])
      else:
        splits = [None] * len(smiles)
      compound_rows += [list(elt) for elt in zip(compound_ids, smiles, splits)]

    compounds_df = pd.DataFrame(compound_rows,
                                columns=("mol_id", "smiles", "split"))
    return compounds_df
Loading