Commit 6f4ba45d authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Initial commit of refactored dataset handling

parent b9efcd34
Loading
Loading
Loading
Loading
+7 −3
Original line number Diff line number Diff line
@@ -26,7 +26,8 @@ def fit_multitask_mlp(train_data, task_types, **training_params):
  models = {}
  # Follows convention from process_datasets that the data for multitask models
  # is grouped under key "all"
  (_, X_train, y_train, W_train) = train_data["all"]
  X_train = train_data["features"]
  (y_train, W_train) = train_data["all"]
  models["all"] = train_multitask_model(X_train, y_train, W_train, task_types,
                                **training_params)
  return models
@@ -45,10 +46,13 @@ def fit_singletask_mlp(train_data, task_types, **training_params):
    Aggregates keyword parameters to pass to train_multitask_model
  """
  models = {}
  for index, target in enumerate(sorted(train_data.keys())):
  train_ids = train_data["mol_ids"]
  X_train = train_data["features"]
  sorted_targets = train_data["sorted_targets"]
  for index, target in enumerate(sorted_targets):
    print "Training model %d" % index
    print "Target %s" % target
    (train_ids, X_train, y_train, W_train) = train_data[target]
    (y_train, W_train) = train_data[target]
    print "%d compounds in Train" % len(train_ids)
    models[target] = train_multitask_model(X_train, y_train, W_train,
        {target: task_types[target]}, **training_params)
+7 −3
Original line number Diff line number Diff line
@@ -7,14 +7,18 @@ from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D

def fit_3D_convolution(train_data, task_types, **training_params):
  """
  Perform stochastic gradient descent for a 3D CNN.

  Parameters
  ----------
  train_data: dict
    Must hold "features" (the model inputs), "sorted_targets" (a list of
    target names) and, for each target name, a (labels, weights) pair.
  task_types: dict
    Not used by this function; kept so the signature matches the other
    fit_* entry points.
  training_params: dict
    Keyword arguments forwarded unchanged to train_3D_convolution.

  Returns
  -------
  dict
    Maps the single target name to the value returned by
    train_3D_convolution.

  Raises
  ------
  ValueError
    If "sorted_targets" lists more than one target.
  """
  sorted_targets = train_data["sorted_targets"]
  if len(sorted_targets) > 1:
    raise ValueError("3D Convolutions only supported for singletask.")
  target_name = sorted_targets[0]
  X_train = train_data["features"]
  # "sorted_targets" is a list of names, so the labels have to be looked up
  # under the target's own key rather than iterated out of that list.
  (y_train, _) = train_data[target_name]
  models = {}
  models[target_name] = train_3D_convolution(X_train, y_train, **training_params)
  return models

def train_3D_convolution(X, y, batch_size=50, nb_epoch=1,learning_rate=0.01,
+14 −12
Original line number Diff line number Diff line
@@ -28,9 +28,11 @@ def fit_singletask_models(train_data, modeltype):
    None or "log". Only for regression outputs.
  """
  models = {}
  for target in sorted(train_data.keys()):
  X_train = train_data["features"]
  sorted_targets = train_data["sorted_targets"]
  for target in sorted_targets:
    print "Building model for target %s" % target
    (_, X_train, y_train, _) = train_data[target]
    (y_train, _) = train_data[target]
    if modeltype == "rf_regressor":
      model = RandomForestRegressor(
          n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
@@ -55,13 +57,13 @@ def fit_singletask_models(train_data, modeltype):
    models[target] = model
  return models

# TODO(rbharath): I believe this is broken. Update it to work with the rest of
# the package.
def fit_multitask_rf(train_data):
  """Train a random forest classifier on a multitask dataset.

  train_data is unpacked as a 4-tuple; only its second and third entries
  (used as features and labels) take part in fitting. The fitted
  classifier is returned.
  """
  _, features, labels, _ = train_data
  forest = RandomForestClassifier(
      class_weight="auto", n_jobs=-1, n_estimators=100)
  forest.fit(features, labels)
  return forest
## TODO(rbharath): I believe this is broken. Update it to work with the rest of
## the package.
#def fit_multitask_rf(train_data):
#  """Fits a multitask RF model to provided dataset.
#  """
#  (_, X_train, y_train, _) = train_data
#  model = RandomForestClassifier(
#      n_estimators=100, n_jobs=-1, class_weight="auto")
#  model.fit(X_train, y_train)
#  return model
+0 −1
Original line number Diff line number Diff line
@@ -397,7 +397,6 @@ def _train_test_input(paths, output_transforms, input_transforms,
                      feature_types, splittype, weight_positives, mode,
                      train_out, test_out, target_names):
  """Saves transformed model."""
  #targets = get_target_names(paths)
  if output_transforms == "" or output_transforms == "None":
    output_transforms = []
  else:
+40 −28
Original line number Diff line number Diff line
@@ -22,7 +22,12 @@ def process_datasets(paths, input_transforms, output_transforms,
    splittype="random", seed=None, weight_positives=True, target_names=[]):
  """Extracts datasets and split into train/test.

  Returns a dict that maps target names to tuples.
  Returns a dict with the following keys
  
  "features" -> X
  "mol_ids"  -> ids
  target -> (y, W) 
  "sorted_targets" -> sorted_targets

  Parameters
  ----------
@@ -37,30 +42,36 @@ def process_datasets(paths, input_transforms, output_transforms,
    Seed used for random splits.
  """
  dataset = load_datasets(paths, feature_types=feature_types, target_names=target_names)
  train_dict, test_dict = {}, {}
  if mode == "singletask":
    singletask = multitask_to_singletask(dataset)
    print("Completed multitask_to_singletask")
    for task in singletask:
      print(task)
      print("About to split dataset")
      data = singletask[task]
      if len(data) == 0:
        continue
      print("About to split train and test")
  train, test = split_dataset(dataset, splittype)
      print("Done spliting train and test")
      train_dict[task], test_dict[task] = to_arrays(train, test)
      print("to_arrays is done")
  elif mode == "multitask":
    train, test = split_dataset(dataset, splittype)
    train_data, test_data = to_arrays(train, test)
    train_dict["all"], test_dict["all"] = train_data, test_data
  else:
    raise ValueError("Unsupported mode for process_datasets.")
  target = train_dict.itervalues().next()
  print "Shape of Xtrain"
  print np.shape(target[1])
  train_dict = standardize(train, mode=mode)
  test_dict = standardize(test, mode=mode)
  #if mode == "singletask":
  #  # Perform common train/test split across all tasks
  #  #train_features, train_labels = multitask_to_singletask(train)
  #  #test_features, test_labels = multitask_to_singletask(test)
  #  #train_dict["features"], train_dict["labels"] = train_features, train_labels
  #  #test_dict["features"], test_dict["labels"] = test_features, test_labels
  #  #print("Completed multitask_to_singletask")
  #  #for task in singletask:
  #  #  print(task)
  #  #  print("About to split dataset")
  #  #  data = singletask[task]
  #  #  if len(data) == 0:
  #  #    continue
  #  #  print("About to split train and test")
  #  #  train, test = split_dataset(dataset, splittype)
  #  #  print("Done spliting train and test")
  #  #  train_dict[task], test_dict[task] = to_arrays(train, test)
  #  #  print("to_arrays is done")
  #elif mode == "multitask":
  #  train, test = split_dataset(dataset, splittype)
  #  train_data, test_data = to_arrays(train, test)
  #  train_dict["all"], test_dict["all"] = train_data, test_data
  #else:
  #  raise ValueError("Unsupported mode for process_datasets.")
  #target = train_dict.itervalues().next()
  #print "Shape of Xtrain"
  #print np.shape(target[1])
  return train_dict, test_dict 

def load_molecules(paths, feature_types=["fingerprints"]):
@@ -196,9 +207,10 @@ def transform_data(data, input_transforms, output_transforms):
    transformations. Only for regression outputs.
  """
  trans_dict = {}
  X = transform_inputs(train_dict["features"], input_transforms)
  trans_dict["mol_ids"], trans_dict["features"] = train_dict["mol_ids"], X
  for target in data:
    ids, X, y, W = data[target]
    y, W = data[target]
    y = transform_outputs(y, W, output_transforms)
    X = transform_inputs(X, input_transforms)
    trans_dict[target] = (ids, X, y, W)
    trans_dict[target] = (y, W)
  return trans_dict
Loading