Commit c2c3cd44 authored by Bharath Ramsundar
Browse files

More cleanup. Last commit before start of some API surgery.

parent 824d1c4a
Loading
Loading
Loading
Loading
+0 −2
Original line number Diff line number Diff line
@@ -249,7 +249,5 @@ def main():
  args = parse_args()
  args.func(args)



# Invoke main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
  main()
+0 −27
Original line number Diff line number Diff line
@@ -90,33 +90,6 @@ def generate_vs_util_features(df, name, out, smiles_endpoint, id_endpoint, featu
  with gzip.open(features, "wb") as f:
    pickle.dump(feature_df, f, pickle.HIGHEST_PROTOCOL)

'''
def generate_descriptors(df, name, out, smiles_endpoint, id_endpoint):
  """Generates molecular descriptors for dataset."""
  dataset_dir = os.path.join(out, name)
  descriptors_dir = os.path.join(dataset_dir, "descriptors")
  shards_dir = os.path.join(dataset_dir, "shards")
  descriptors = os.path.join(descriptors_dir,
      "%s-descriptors.pkl.gz" % name)

  descriptors_df = pd.DataFrame([]) 
  descriptors_df["smiles"] = df[[smiles_endpoint]]
  descriptors_df["scaffolds"] = df[[smiles_endpoint]].apply(
    functools.partial(generate_scaffold, smiles_endpoint=smiles_endpoint),
    axis=1)
  descriptors_df["mol_id"] = df[[id_endpoint]]
  mols = []
  for row in df.iterrows():
    # pandas rows are tuples (row_num, row_data)
    smiles = row[1][smiles_endpoint]
    mols.append(Chem.MolFromSmiles(smiles))
  featurizer = SimpleDescriptors()
  descriptors_df["features"] = pd.DataFrame([ {"features": feature} for feature in featurizer.featurize(mols)])

  with gzip.open(descriptors, "wb") as f:
    pickle.dump(descriptors_df, f, pickle.HIGHEST_PROTOCOL)
'''

def get_rows(input_file, input_type, delimiter):
  """Returns an iterator over all rows in input_file"""
  # TODO(rbharath): This function loads into memory, which can be painful. The
+0 −50
Original line number Diff line number Diff line
@@ -151,7 +151,6 @@ def dataset_to_numpy(dataset, feature_endpoint="fingerprint",
  n_samples = len(dataset.keys())
  sample_datapoint = dataset.itervalues().next()
  feature_shape = np.shape(sample_datapoint[feature_endpoint])
  print np.shape(feature_shape)
  
  #n_targets = 1 # TODO(rbharath): Generalize this later
  n_targets = len(sample_datapoint[labels_endpoint])
@@ -173,59 +172,10 @@ def dataset_to_numpy(dataset, feature_endpoint="fingerprint",
        W[index][t_ind] = 0
      else:
        y[index][t_ind] = labels[target]
  print "DATASET_TO_NUMPY"
  print "np.shape(X)"
  print np.shape(X)
  if weight_positives:
    W = balance_positives(y, W)
  return (X, y, W)

"""
def dataset_to_numpy(dataset, feature_endpoint="fingerprint",
    labels_endpoint="labels", weight_positives=True):
  '''Transforms a loaded dataset into numpy arrays (X, y).

  Transforms provided dict into feature matrix X (of dimensions [n_samples,
  n_features]) and label matrix y (of dimensions [n_samples,
  n_targets+n_desc]), where n_targets is the number of assays in the
  provided dataset and n_desc is the number of computed descriptors we'd
  like to predict.

  Note that this function transforms missing data into negative examples
  (this is relatively safe since the ratio of positive to negative examples
  is on the order 1/100)

  Parameters
  ----------
  dataset: dict 
    A dictionary of type produced by load_datasets. 
  '''
  n_samples = len(dataset.keys())
  sample_datapoint = dataset.itervalues().next()
  n_features = np.size(sample_datapoint[feature_endpoint])
  n_targets = len(sample_datapoint[labels_endpoint])
  X = np.zeros((n_samples, n_features))
  y = np.zeros((n_samples, n_targets))
  W = np.ones((n_samples, n_targets))
  sorted_smiles = sorted(dataset.keys())
  for index, smiles in enumerate(sorted_smiles):
    datapoint = dataset[smiles] 
    fingerprint, labels  = (datapoint[feature_endpoint],
        datapoint[labels_endpoint])
    X[index] = np.array(fingerprint).flatten()
    sorted_targets = sorted(labels.keys())
    # Set labels from measurements
    for t_ind, target in enumerate(sorted_targets):
      if labels[target] == -1:
        y[index][t_ind] = -1
        W[index][t_ind] = 0
      else:
        y[index][t_ind] = labels[target]
  if weight_positives:
    W = balance_positives(y, W)
  return X, y, W
"""

def multitask_to_singletask(dataset):
  """Transforms a multitask dataset to a singletask dataset.