Commit d38f88cc authored by Bharath Ramsundar

Merge pull request #1 from rbharath/master

Merging in Deep-Docking Support
parents 7bf3ea43 c5778d0e
+54 −87
@@ -9,6 +9,7 @@ from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD
from deep_chem.utils.load import load_datasets
from deep_chem.utils.load import ensure_balanced
from deep_chem.utils.preprocess import multitask_to_singletask
from deep_chem.utils.preprocess import train_test_random_split
from deep_chem.utils.preprocess import train_test_scaffold_split
@@ -20,8 +21,8 @@ from deep_chem.utils.evaluate import compute_rms_scores
from deep_chem.utils.evaluate import compute_roc_auc_scores
from deep_chem.utils.load import load_and_transform_dataset

def process_multitask(paths, task_transforms, desc_transforms, splittype="random",
    seed=None, add_descriptors=False, desc_weight=0.5):
def process_multitask(paths, task_transforms, splittype="random",
    seed=None, weight_positives=False):
  """Extracts multitask datasets and splits into train/test.

  Returns a tuple of test/train datasets, fingerprints, and labels.
@@ -35,31 +36,35 @@ def process_multitask(paths, task_transforms, desc_transforms, splittype="random
    List of paths to Google virtual-screening (vs) datasets.
  task_transforms: dict 
    dict mapping target names to label transform. Each output type must be either
    None or "log". Only for regression outputs.
  desc_transforms: dict
    dict mapping descriptor number to transform. Each transform must be
    either None, "log", "normalize", or "log-normalize"
    None, "log", "normalize" or "log-normalize". Only for regression outputs.
  splittype: string
    Must be "random" or "scaffold"
  seed: int
    Seed used for random splits.
  """
  dataset = load_and_transform_dataset(paths, task_transforms, desc_transforms,
      add_descriptors=add_descriptors)
  dataset = load_and_transform_dataset(paths, task_transforms,
      weight_positives=weight_positives)
  sorted_targets = sorted(dataset.keys())
  if splittype == "random":
    train, test = train_test_random_split(dataset, seed=seed)
  elif splittype == "scaffold":
    train, test = train_test_scaffold_split(dataset)
  else:
    raise ValueError("Improper splittype. Must be random/scaffold.")
  X_train, y_train, W_train = dataset_to_numpy(train,
      add_descriptors=add_descriptors, desc_weight=desc_weight)
  X_test, y_test, W_test = dataset_to_numpy(test,
      add_descriptors=add_descriptors, desc_weight=desc_weight)
  X_train, y_train, W_train = dataset_to_numpy(train)
  ## TODO(rbharath): Still need to fix the failures for PCBA. Temporarily
  ## commenting out to experiment.
  #if weight_positives:
  #  print "Train set balance"
  #  ensure_balanced(y_train, W_train)
  X_test, y_test, W_test = dataset_to_numpy(test)
  #if weight_positives:
  #  print "Test set balance"
  #  ensure_balanced(y_test, W_test)
  return (train, X_train, y_train, W_train, test, X_test, y_test, W_test)
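
The weight_positives flag threaded through this function is exercised by the
commented-out ensure_balanced calls above, which verify that positive and
negative examples carry equal total weight for each task. ensure_balanced
itself is not shown in this diff; a minimal sketch of such a check, assuming
y holds {0, 1} labels and W per-example weights, both of shape
(n_samples, n_tasks), might look like:

import numpy as np

def ensure_balanced_sketch(y, W, tol=1e-3):
  # Hypothetical stand-in for deep_chem.utils.load.ensure_balanced; the real
  # signature and behavior may differ.
  for task in range(y.shape[1]):
    pos_weight = W[y[:, task] == 1, task].sum()
    neg_weight = W[y[:, task] == 0, task].sum()
    print("task %d: positive weight %.2f, negative weight %.2f"
          % (task, pos_weight, neg_weight))
    assert abs(pos_weight - neg_weight) <= tol * (pos_weight + neg_weight)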

def process_singletask(paths, task_transforms, desc_transforms, splittype="random", seed=None,
    add_descriptors=False, desc_weight=0.5):
def process_singletask(paths, task_transforms, splittype="random", seed=None,
    weight_positives=True):
  """Extracts singletask datasets and splits into train/test.

  Returns a dict that maps target names to tuples.
@@ -76,13 +81,12 @@ def process_singletask(paths, task_transforms, desc_transforms, splittype="rando
  seed: int
    Seed used for random splits.
  """
  dataset = load_and_transform_dataset(paths, task_transforms, desc_transforms,
      add_descriptors=add_descriptors)
  dataset = load_and_transform_dataset(paths, task_transforms,
      weight_positives=weight_positives)
  singletask = multitask_to_singletask(dataset)
  arrays = {}
  for target in singletask:
    data = singletask[target]
    # TODO(rbharath): Remove limitation after debugging.
    if len(data) == 0:
      continue
    if splittype == "random":
@@ -91,18 +95,15 @@ def process_singletask(paths, task_transforms, desc_transforms, splittype="rando
      train, test = train_test_scaffold_split(data)
    else:
      raise ValueError("Improper splittype. Must be random/scaffold.")
    X_train, y_train, W_train = dataset_to_numpy(train,
        add_descriptors=add_descriptors, desc_weight=desc_weight)
    X_test, y_test, W_test = dataset_to_numpy(test,
        add_descriptors=add_descriptors, desc_weight=desc_weight)
    X_train, y_train, W_train = dataset_to_numpy(train)
    X_test, y_test, W_test = dataset_to_numpy(test)
    arrays[target] = (train, X_train, y_train, W_train, test, X_test, y_test,
        W_test)
  return arrays
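
train_test_scaffold_split is imported above but defined elsewhere. A common
approach is to group compounds by Bemis-Murcko scaffold so that structurally
related molecules never straddle the train/test boundary. A hedged sketch with
RDKit, assuming the dataset is keyed by SMILES strings (the real
implementation may key and order things differently):

from collections import defaultdict
from rdkit.Chem.Scaffolds import MurckoScaffold

def scaffold_split_sketch(dataset, frac_train=0.8):
  # Group datapoints by Murcko scaffold.
  scaffolds = defaultdict(list)
  for smiles in dataset:
    scaffolds[MurckoScaffold.MurckoScaffoldSmiles(smiles=smiles)].append(smiles)
  # Fill train with the largest scaffold groups first; overflow goes to test.
  train, test = {}, {}
  cutoff = frac_train * len(dataset)
  for group in sorted(scaffolds.values(), key=len, reverse=True):
    dest = train if len(train) + len(group) <= cutoff else test
    for smiles in group:
      dest[smiles] = dataset[smiles]
  return train, test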


def fit_multitask_mlp(paths, task_types, task_transforms, desc_transforms,
                      splittype="random", add_descriptors=False, desc_weight=0.5,
                      **training_params):
def fit_multitask_mlp(paths, task_types, task_transforms,
                      splittype="random", weight_positives=False, **training_params):
  """
  Perform stochastic gradient descent optimization for a keras multitask MLP.
  Returns AUCs, R^2 scores, and RMS values.
@@ -116,29 +117,18 @@ def fit_multitask_mlp(paths, task_types, task_transforms, desc_transforms,
    "classification" or "regression".
  task_transforms: dict 
    dict mapping target names to label transform. Each output type must be either
    None or "log". Only for regression outputs.
  desc_transforms: dict
    dict mapping descriptor number to transform. Each transform must be
    either None, "log", "normalize", or "log-normalize"
  add_descriptors: bool
    Add descriptor prediction as extra task.
    None, "log", "normalize", or "log-normalize". Only for regression outputs.
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model
  """
  (train, X_train, y_train, W_train, test, X_test, y_test, W_test) = (
      process_multitask(paths, task_transforms, desc_transforms,
      splittype=splittype, add_descriptors=add_descriptors, desc_weight=desc_weight))
      process_multitask(paths, task_transforms, splittype=splittype,
      weight_positives=weight_positives))
  print np.shape(y_train)
  model = train_multitask_model(X_train, y_train, W_train, task_types,
                                desc_transforms, add_descriptors=add_descriptors,
                                **training_params)
  results = eval_model(test, model, task_types, desc_transforms,
      add_descriptors=add_descriptors, modeltype="keras_multitask")
  if add_descriptors:
    local_task_types = task_types.copy()
    for desc in desc_transforms:
      local_task_types[desc] = "regression"
  else:
  results = eval_model(test, model, task_types,
      modeltype="keras_multitask")
  local_task_types = task_types.copy()
  aucs = compute_roc_auc_scores(results, local_task_types)
  if aucs:
@@ -146,15 +136,10 @@ def fit_multitask_mlp(paths, task_types, task_transforms, desc_transforms,
  r2s = compute_r2_scores(results, local_task_types)
  if r2s:
    print "Mean R^2: %f" % np.mean(np.array(r2s.values()))
  #rms = compute_rms_scores(results, local_task_types)
  #if rms:
  #  print "Mean RMS: %f" % np.mean(np.array(rms.values()))
  #return (aucs, r2s, rms)
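
compute_roc_auc_scores returns a dict mapping each classification target to
its ROC AUC, which the callers above average and print. A minimal sketch of
that computation with scikit-learn, assuming results maps target names to
(y_true, y_pred) pairs (the actual layout produced by eval_model may differ):

from sklearn.metrics import roc_auc_score

def compute_roc_auc_scores_sketch(results, task_types):
  aucs = {}
  for target, (y_true, y_pred) in results.items():
    if task_types[target] != "classification":
      continue
    try:
      aucs[target] = roc_auc_score(y_true, y_pred)
    except ValueError:
      # AUC is undefined when only one class is present in y_true.
      pass
  return aucs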

def fit_singletask_mlp(paths, task_types, task_transforms,
                       desc_transforms, splittype="random",
                       add_descriptors=False, desc_weight=0.5,
                       **training_params):
                       splittype="random", weight_positives=True,
                       num_to_train=None, **training_params):
  """
  Perform stochastic gradient descent optimization for a keras MLP.

@@ -166,30 +151,27 @@ def fit_singletask_mlp(paths, task_types, task_transforms,
  task_transforms: dict 
    dict mapping target names to label transform. Each output type must be either
    None or "log". Only for regression outputs.
  desc_transforms: dict
    dict mapping descriptor number to transform. Each transform must be
    either None, "log", "normalize", or "log-normalize"
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model
  """
  singletasks = process_singletask(paths, task_transforms, desc_transforms,
    splittype=splittype, add_descriptors=add_descriptors,
    desc_weight=desc_weight)
  singletasks = process_singletask(paths, task_transforms,
    splittype=splittype, weight_positives=weight_positives)
  ret_vals = {}
  aucs, r2s, rms = {}, {}, {}
  for index, target in enumerate(singletasks):
  sorted_targets = sorted(singletasks.keys())
  if num_to_train:
    sorted_targets = sorted_targets[:num_to_train]
  for index, target in enumerate(sorted_targets):
    print "Training model %d" % index
    print "Target %s" % target
    (train, X_train, y_train, W_train, test, X_test, y_test, W_test) = (
        singletasks[target])
    model = train_multitask_model(X_train, y_train, W_train,
        {target: task_types[target]}, desc_transforms, add_descriptors=add_descriptors,
        **training_params)
        {target: task_types[target]}, **training_params)
    results = eval_model(test, model, {target: task_types[target]}, 
                         desc_transforms,
                         # We run singletask models as special cases of
                         # multitask.
                         modeltype="keras_multitask",
                         add_descriptors=add_descriptors)
    print "Target %s" % target
                         modeltype="keras_multitask")
    target_aucs = compute_roc_auc_scores(results, task_types)
    target_r2s = compute_r2_scores(results, task_types)
    target_rms = compute_rms_scores(results, task_types)
@@ -198,24 +180,23 @@ def fit_singletask_mlp(paths, task_types, task_transforms,
    r2s.update(target_r2s)
    rms.update(target_rms)
  if aucs:
    print aucs
    print "Mean AUC: %f" % np.mean(np.array(aucs.values()))
  if r2s:
    print r2s
    print "Mean R^2: %f" % np.mean(np.array(r2s.values()))
  if rms:
    print rms
    print "Mean RMS: %f" % np.mean(np.array(rms.values()))

def train_multitask_model(X, y, W, task_types, desc_transforms, add_descriptors=False,
                      learning_rate=0.01, decay=1e-6,
                      momentum=0.9, nesterov=True, activation="relu",
                      dropout=0.5, nb_epoch=20, batch_size=50, n_hidden=500,
                      n_input=1024, validation_split=0.1):
def train_multitask_model(X, y, W, task_types,
  learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
  dropout=0.5, nb_epoch=20, batch_size=50, n_hidden=500, n_input=1024,
  validation_split=0.1):
  """
  Perform stochastic gradient descent optimization for a keras multitask MLP.
  Returns a trained model.

  TODO(rbharath): The handling of add_descriptors for semi-supervised learning
  is horrible. Refactor.

  Parameters
  ----------
  X: np.ndarray
@@ -227,11 +208,6 @@ def train_multitask_model(X, y, W, task_types, desc_transforms, add_descriptors=
  task_types: dict 
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
  desc_transforms: dict
    dict mapping descriptor number to transform. Each transform must be
    either None, "log", "normalize", or "log-normalize"
  add_descriptors: bool
    Add descriptor prediction as extra task.
  learning_rate: float
    Learning rate used.
  decay: float
@@ -246,16 +222,8 @@ def train_multitask_model(X, y, W, task_types, desc_transforms, add_descriptors=
  eps = .001
  num_tasks = len(task_types)
  sorted_targets = sorted(task_types.keys())
  if add_descriptors:
    sorted_descriptors = sorted(desc_transforms.keys())
    endpoints = sorted_targets + sorted_descriptors
    local_task_types = task_types.copy()
    for desc in desc_transforms:
      local_task_types[desc] = "regression"
  else:
  local_task_types = task_types.copy()
  endpoints = sorted_targets
  print "endpoints: " + str(endpoints)
  # Add eps weight to avoid minibatches with zero weight (causes theano to crash).
  W = W + eps * np.ones(np.shape(W))
  model = Graph()
@@ -293,7 +261,6 @@ def train_multitask_model(X, y, W, task_types, desc_transforms, add_descriptors=
  model.compile(optimizer=sgd, loss=loss_dict)
  print "Done compiling. About to fit model!"
  print "validation_split: " + str(validation_split)
  print "decay: " + str(decay)
  model.fit(data_dict, nb_epoch=nb_epoch, batch_size=batch_size, validation_split=validation_split,
            sample_weight=sample_weights)
  model.fit(data_dict, nb_epoch=nb_epoch, batch_size=batch_size,
    validation_split=validation_split, sample_weight=sample_weights)
  return model
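
Two details here deserve a note. The W + eps adjustment exists because a
minibatch whose sample weights are all zero produces a 0/0 in the weighted
loss, which crashes Theano-backed Keras; adding a tiny constant weight
everywhere sidesteps that without materially changing the objective. The
loss_dict passed to model.compile is built outside the lines shown; plausibly
it maps each endpoint to a loss by task type. A sketch of both (the helper
below is an assumption, not the actual deep_chem code):

import numpy as np

eps = .001
W = np.zeros((4, 2))                 # pathological all-zero weight block
W = W + eps * np.ones(np.shape(W))   # now no minibatch can have zero weight
assert W.min() > 0

def make_loss_dict_sketch(task_types, endpoints):
  # Cross-entropy for classification endpoints, squared error for regression.
  return dict((endpoint,
               "binary_crossentropy"
               if task_types[endpoint] == "classification"
               else "mean_squared_error")
              for endpoint in endpoints)
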
+12 −10
@@ -2,6 +2,7 @@
Code for processing datasets using scikit-learn.
"""
import numpy as np
from deep_chem.utils.analysis import results_to_csv
from deep_chem.utils.load import load_and_transform_dataset
from deep_chem.utils.preprocess import multitask_to_singletask
from deep_chem.utils.preprocess import train_test_random_split
@@ -23,8 +24,7 @@ from sklearn.linear_model import LassoLarsCV
from sklearn.svm import SVR

def fit_singletask_models(paths, modeltype, task_types, task_transforms,
    add_descriptors=False, desc_transforms={}, splittype="random",
    seed=None):
    splittype="random", seed=None, num_to_train=None):
  """Fits singletask linear regression models to potency.

  Parameters
@@ -43,15 +43,15 @@ def fit_singletask_models(paths, modeltype, task_types, task_transforms,
  task_transforms: dict 
    dict mapping target names to label transform. Each output type must be either
    None or "log". Only for regression outputs.
  desc_transforms: dict
    dict mapping descriptor number to transform. Each transform must be
    either None, "log", "normalize", or "log-normalize"
  """
  dataset = load_and_transform_dataset(paths, task_transforms, desc_transforms,
      add_descriptors=add_descriptors)
  dataset = load_and_transform_dataset(paths, task_transforms)
  singletask = multitask_to_singletask(dataset)
  aucs, r2s, rms = {}, {}, {}
  for target in singletask:
  sorted_targets = sorted(singletask.keys())
  if num_to_train:
    sorted_targets = sorted_targets[:num_to_train]
  for index, target in enumerate(sorted_targets):
    print "Building model %d" % index
    data = singletask[target]
    if splittype == "random":
      train, test = train_test_random_split(data, seed=seed)
@@ -82,9 +82,8 @@ def fit_singletask_models(paths, modeltype, task_types, task_transforms,
    else:
      raise ValueError("Invalid model type provided.")
    model.fit(X_train, y_train.ravel())
    # TODO(rbharath): This breaks on regression datasets
    results = eval_model(test, model, {target: task_types[target]},
        desc_transforms, modeltype="sklearn", add_descriptors=add_descriptors)
        modeltype="sklearn")

    target_aucs = compute_roc_auc_scores(results, task_types)
    target_r2s = compute_r2_scores(results, task_types)
@@ -94,10 +93,13 @@ def fit_singletask_models(paths, modeltype, task_types, task_transforms,
    r2s.update(target_r2s)
    rms.update(target_rms)
  if aucs:
    print results_to_csv(aucs)
    print "Mean AUC: %f" % np.mean(np.array(aucs.values()))
  if r2s:
    print results_to_csv(r2s)
    print "Mean R^2: %f" % np.mean(np.array(r2s.values()))
  if rms:
    print results_to_csv(rms)
    print "Mean RMS: %f" % np.mean(np.array(rms.values()))


+17 −28
@@ -5,30 +5,14 @@ import argparse
import numpy as np
from deep_chem.models.deep import fit_singletask_mlp
from deep_chem.models.deep import fit_multitask_mlp
from deep_chem.models.deep import train_multitask_model
from deep_chem.models.standard import fit_singletask_models
from deep_chem.models.standard import fit_multitask_rf
from deep_chem.utils.analysis import compare_datasets
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import compute_roc_auc_scores
from deep_chem.utils.evaluate import compute_r2_scores
from deep_chem.utils.evaluate import compute_rms_scores
from deep_chem.utils.load import get_target_names
from deep_chem.utils.load import load_datasets
from deep_chem.utils.load import load_and_transform_dataset
from deep_chem.utils.preprocess import dataset_to_numpy
from deep_chem.utils.preprocess import train_test_random_split
from deep_chem.utils.preprocess import train_test_scaffold_split
from deep_chem.utils.preprocess import scaffold_separate
from deep_chem.utils.preprocess import multitask_to_singletask
from deep_chem.utils.load import get_default_task_types_and_transforms
from deep_chem.utils.preprocess import get_default_descriptor_transforms

def parse_args(input_args=None):
  """Parse command-line arguments."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--datasets', required=1, nargs="+",
                      choices=['muv', 'pcba', 'dude', 'pfizer', 'globavir'],
                      choices=['muv', 'pcba', 'dude', 'pfizer', 'globavir', 'pdbbind'],
                      help='Name of dataset to process.')
  parser.add_argument("--paths", required=1, nargs="+",
                      help = "Paths to input datasets.")
@@ -53,6 +37,11 @@ def parse_args(input_args=None):
                  help="Learning rate decay for NN models.")
  parser.add_argument("--validation-split", type=float, default=0.0,
                  help="Percent of training data to use for validation.")
  parser.add_argument("--weight-positives", type=bool, default=False,
                  help="Weight positive examples to have same total weight as negatives.")
  # TODO(rbharath): Remove this once debugging is complete.
  parser.add_argument("--num-to-train", type=int, default=None,
                  help="Number of datasets to train on. Only for debug.")
  return parser.parse_args(input_args)
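
Boolean flags deserve care with argparse: type=bool does not parse strings as
booleans, since bool applied to any non-empty string (including "False") is
True, which is why --weight-positives uses a store_true action. A quick
demonstration of the trap:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--broken", type=bool, default=False)  # anti-pattern
parser.add_argument("--works", action="store_true")
args = parser.parse_args(["--broken", "False", "--works"])
print(args.broken)  # True: bool("False") is True, as for any non-empty string
print(args.works)   # True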

def main():
@@ -62,24 +51,24 @@ def main():
    paths[dataset] = path

  task_types, task_transforms = get_default_task_types_and_transforms(paths)
  desc_transforms = get_default_descriptor_transforms()

  if args.model == "singletask_deep_network":
    fit_singletask_mlp(paths.values(), task_types, task_transforms,
      desc_transforms, splittype=args.splittype, add_descriptors=False,
      n_hidden=args.n_hidden, learning_rate=args.learning_rate,
      dropout=args.dropout, nb_epoch=args.n_epochs, decay=args.decay,
      batch_size=args.batch_size,
      validation_split=args.validation_split)
      splittype=args.splittype, n_hidden=args.n_hidden,
      learning_rate=args.learning_rate, dropout=args.dropout,
      nb_epoch=args.n_epochs, decay=args.decay, batch_size=args.batch_size,
      validation_split=args.validation_split,
      weight_positives=args.weight_positives, num_to_train=args.num_to_train)
  elif args.model == "multitask_deep_network":
    fit_multitask_mlp(paths.values(), task_types, task_transforms,
      desc_transforms, splittype=args.splittype, add_descriptors=False,
      n_hidden=args.n_hidden, learning_rate = args.learning_rate, dropout = args.dropout,
      batch_size=args.batch_size,
      nb_epoch=args.n_epochs, decay=args.decay, validation_split=args.validation_split)
      splittype=args.splittype, n_hidden=args.n_hidden,
      learning_rate=args.learning_rate, dropout=args.dropout,
      batch_size=args.batch_size,
      nb_epoch=args.n_epochs, decay=args.decay,
      validation_split=args.validation_split,
      weight_positives=args.weight_positives)
  else:
    fit_singletask_models(paths.values(), args.model, task_types,
        task_transforms, splittype="scaffold")
        task_transforms, splittype=args.splittype, num_to_train=args.num_to_train)

if __name__ == "__main__":
  main()
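
Pulling the new flags together, a hypothetical invocation of this script might
look like the following (script name and paths are placeholders; flag
spellings are inferred from the args attributes used in main()):

python deep_chem.py --datasets pcba --paths /path/to/pcba \
    --model multitask_deep_network --splittype scaffold \
    --n-hidden 500 --learning-rate 0.01 --dropout 0.5 --n-epochs 20 \
    --batch-size 50 --decay 1e-6 --weight-positives
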
+55 −25
@@ -8,6 +8,7 @@ import pandas as pd
import openpyxl as px
import numpy as np
import argparse
import csv
from rdkit import Chem
import subprocess
from vs_utils.utils import SmilesGenerator
@@ -15,8 +16,13 @@ from vs_utils.utils import SmilesGenerator
def parse_args(input_args=None):
  """Parse command-line arguments."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--xlsx', required=1,
                      help='Excel file with Globavir data.')
  parser.add_argument('--input-file', required=1,
                      help='Input file with data.')
  parser.add_argument("--columns", required=1, nargs="+",
                      help = "Names of columns.")
  parser.add_argument('--column-types', required=1, nargs="+",
                      choices=['string', 'float', 'list', 'float-array'],
                      help='Types of the columns (parallel to --columns).')
  parser.add_argument("--name", required=1,
                      help="Name of the dataset.")
  parser.add_argument("--out", required=1,
@@ -28,7 +34,7 @@ def generate_directories(name, out):
  dataset_dir = os.path.join(out, name)
  if not os.path.exists(dataset_dir):
    os.makedirs(dataset_dir)
  fingerprint_dir = os.path.join(dataset_dir, "circular-scaffold-smiles")
  fingerprint_dir = os.path.join(dataset_dir, "fingerprints")
  if not os.path.exists(fingerprint_dir):
    os.makedirs(fingerprint_dir)
  target_dir = os.path.join(dataset_dir, "targets")
@@ -58,42 +64,66 @@ def parse_float_input(val):

def generate_fingerprints(name, out):
  dataset_dir = os.path.join(out, name)
  fingerprint_dir = os.path.join(dataset_dir, "circular-scaffold-smiles")
  fingerprint_dir = os.path.join(dataset_dir, "fingerprints")
  shards_dir = os.path.join(dataset_dir, "shards")
  sdf = os.path.join(shards_dir, "%s-0.sdf.gz" % name)
  fingerprints = os.path.join(fingerprint_dir,
      "%s-circular-scaffolds-smiles.pkl.gz" % name)
      "%s-fingerprints.pkl.gz" % name)
  subprocess.call(["python", "-m", "vs_utils.scripts.featurize",
                   "--scaffolds", "--smiles",
                   sdf, fingerprints,
                   "circular", "--size", "1024"])

def globavir_specs():
  columns = ["compound_name", "isomeric_smiles", "tdo_ic50_nm", "tdo_Ki_nm",
    "tdo_percent_activity_10_um", "tdo_percent_activity_1_um", "ido_ic50_nm",
    "ido_Ki_nm", "ido_percent_activity_10_um", "ido_percent_activity_1_um"]
  column_types = ["string", "string", "float", "float", "float", "float",
      "float", "float", "float", "float"]

def generate_targets(xlsx_file, out_pkl, out_sdf):
  """Process Globavir xlsx file."""
  rows, mols = [], []
def gen_xlsx_rows(xlsx_file):
  W = px.load_workbook(xlsx_file, use_iterators=True)
  p = W.get_sheet_by_name(name="Sheet1")
  return p.iter_rows()

def get_xlsx_row_data(row):
  return [cell.internal_value for cell in row]

def gen_csv_rows(csv_file):
  # NOTE: the file handle opened here is never closed; a fix is sketched below.
  f = open(csv_file, "rb")
  return csv.reader(f, delimiter="\t")
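
The leak noted above can be avoided with a generator that owns the file handle
and closes it when iteration finishes, as a sketch:

import csv

def gen_csv_rows_fixed(csv_file):
  # Yield tab-delimited rows; the with-block closes the handle once the
  # generator is exhausted or garbage-collected.
  with open(csv_file, "rb") as f:
    for row in csv.reader(f, delimiter="\t"):
      yield row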

def generate_targets(input_file, columns, column_types, out_pkl, out_sdf, type="csv"):
  """Process input data file."""
  rows, mols = [], []
  smiles = SmilesGenerator()
  for row_index, row in enumerate(p.iter_rows()):
  if type == "xlsx":
    row_gen = gen_xlsx_rows(input_file)
  elif type == "csv":
    row_gen = gen_csv_rows(input_file)
  for row_index, raw_row in enumerate(row_gen):
    print row_index
    # Skip row labels.
    if row_index == 0:
      continue
    row_data = [cell.internal_value for cell in row]
    # TODO(rbharath): Generalize this code to work for non-Globavir data. 
    row = {
      "compound_name": row_data[0],
      "isomeric_smiles": row_data[1],
      "tdo_ic50_nm": parse_float_input(row_data[5]),
      "tdo_Ki_nm": parse_float_input(row_data[6]),
      "tdo_percent_activity_10_um": parse_float_input(row_data[7]),
      "tdo_percent_activity_1_um": parse_float_input(row_data[8]),
      "ido_ic50_nm": parse_float_input(row_data[9]),
      "ido_Ki_nm": parse_float_input(row_data[10]),
      "ido_percent_activity_10_um": parse_float_input(row_data[11]),
      "ido_percent_activity_1_um": parse_float_input(row_data[12])
    }
    mol = Chem.MolFromSmiles(row["isomeric_smiles"])
    if type == "xlsx":
      row_data = get_xlsx_row_data(raw_row)
    elif type == "csv":
      row_data = raw_row

    row = {}
    for ind, (column, column_type) in enumerate(zip(columns, column_types)):
      if column_type == "string":
        row[column] = row_data[ind]
      elif column_type == "float":
        row[column] = parse_float_input(row_data[ind])
      elif column_type == "list":
        row[column] = row_data[ind].split(",")
      elif column_type == "float-array":
        row[column] = np.array(row_data[ind].split(","))

    mol = Chem.MolFromSmiles(row["smiles"])
    row["smiles"] = smiles.get_smiles(mol)
    mols.append(mol)
    rows.append(row)
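
To make the column-type dispatch concrete: under a hypothetical
--columns/--column-types spec (names invented for illustration), one
tab-separated row parses as follows. Note that the supplied columns must
include a "smiles" entry, since the loop above is followed by
Chem.MolFromSmiles(row["smiles"]).

columns = ["compound_name", "smiles", "ic50_nm", "percent_activity"]
column_types = ["string", "string", "float", "float-array"]
row_data = ["cmpd-1", "CCO", "13.7", "98.2,45.1,3.3"]
# The loop above would then produce:
# row == {"compound_name": "cmpd-1",
#         "smiles": "CCO",
#         "ic50_nm": 13.7,
#         "percent_activity": np.array([98.2, 45.1, 3.3])}
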
@@ -111,7 +141,7 @@ def generate_targets(xlsx_file, out_pkl, out_sdf):
def main():
  args = parse_args()
  out_pkl, out_sdf = generate_directories(args.name, args.out)
  generate_targets(args.xlsx, out_pkl, out_sdf)
  generate_targets(args.input_file, args.columns, args.column_types, out_pkl, out_sdf)
  generate_fingerprints(args.name, args.out)


+6 −0
@@ -7,6 +7,12 @@ __license__ = "LGPL"

import numpy as np

def results_to_csv(results_dict):
  """Pretty prints results as CSV line."""
  targets = sorted(results_dict.keys())
  print ",".join(targets)
  print ",".join([str(results_dict[target]) for target in targets])

def summarize_distribution(y):
  """Analyzes regression dataset.
