Commit 50e9071d authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Next batch of changes required for 3D CNNs

parent 697edab1
Loading
Loading
Loading
Loading
+2 −3
Original line number Diff line number Diff line
@@ -97,8 +97,7 @@ def process_singletask(paths, task_transforms, splittype="random", seed=None,
      raise ValueError("Improper splittype. Must be random/scaffold.")
    X_train, y_train, W_train = dataset_to_numpy(train)
    X_test, y_test, W_test = dataset_to_numpy(test)
    arrays[target] = (train, X_train, y_train, W_train, test, X_test, y_test,
        W_test)
    arrays[target] = (train, X_train, y_train, W_train), (test, X_test, y_test, W_test)
  return arrays


@@ -121,7 +120,7 @@ def fit_multitask_mlp(paths, task_types, task_transforms,
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model
  """
  (train, X_train, y_train, W_train, test, X_test, y_test, W_test) = (
  (train, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
      process_multitask(paths, task_transforms, splittype=splittype,
      weight_positives=weight_positives))
  print np.shape(y_train)
+21 −16
Original line number Diff line number Diff line
"""
Code for training 3D convolutions.
"""
from deep_chem.datasets.shapes_3d import load_data
import numpy as np
from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D
from keras.utils import np_utils
import numpy as np
from deep_chem.utils.preprocess import train_test_random_split
from deep_chem.utils.load import load_and_transform_dataset
from deep_chem.utils.preprocess import tensor_dataset_to_numpy
from deep_chem.datasets.shapes_3d import load_data

# TODO(rbharath): Factor this out into a separate function in utils. Duplicates
# code in deep.py
def process_3D_convolutions(paths, task_transforms, splittype="random"):
def process_3D_convolutions(paths, task_transforms, seed=None, splittype="random"):
  """Loads 3D Convolution datasets.

  Parameters
@@ -19,24 +22,26 @@ def process_3D_convolutions(paths, task_transforms, splittype="random"):
  paths: list
    List of paths to convolution datasets.
  """
  dataset = load_and_transform_dataset(paths, task_transforms)
  dataset = load_and_transform_dataset(paths, task_transforms, datatype="pdbbind")
  # TODO(rbharath): Factor this code splitting out into a util function.
  if splittype == "random":
    train, test = train_test_random_split(dataset, seed=seed)
  elif splittype == "scaffold":
    train, test = train_test_scaffold_split(dataset)
  X_train, y_train, W_train = dataset_to_numpy(train)
  X_test, y_test, W_test = dataset_to_numpy(test)
  X_train, y_train, W_train = tensor_dataset_to_numpy(train)
  X_test, y_test, W_test = tensor_dataset_to_numpy(test)
  return (X_train, y_train, W_train), (X_test, y_test, W_test)

def fit_3D_convolution(axis_length=32, **training_params):
def fit_3D_convolution(paths, task_types, task_transforms, axis_length=32, **training_params):
  """
  Perform stochastic gradient descent for a 3D CNN.
  """
  # TODO(rbharath): task_types is not yet used below.
  (X_train, y_train, W_train), (X_test, y_test, W_test) = process_3D_convolutions(
    paths, task_transforms)
  nb_classes = 2
  (X_train, y_train), (X_test, y_test) = load_data(axis_length=axis_length)
  y_train = np_utils.to_categorical(y_train, nb_classes)
  y_test = np_utils.to_categorical(y_test, nb_classes)
  print "np.shape(y_train)"
  print np.shape(y_train)
  print "np.shape(X_train): " + str(np.shape(X_train))
  print "np.shape(y_train): " + str(np.shape(y_train))
  train_3D_convolution(X_train, y_train, axis_length, **training_params)
@@ -66,9 +71,10 @@ def train_3D_convolution(X, y, axis_length=32, batch_size=50, nb_epoch=1):
  nb_conv = [7, 5, 3]

  model = Sequential()
  model.add(Convolution3D(nb_filter=nb_filters[0], stack_size=1,
  # TODO(rbharath): Avoid hard coding the number of stacks here
  model.add(Convolution3D(nb_filter=nb_filters[0], stack_size=3,
     nb_row=nb_conv[0], nb_col=nb_conv[0], nb_depth=nb_conv[0],
     border_mode='valid'))
     border_mode='valid', input_shape=(32, 32, 32, 3)))
  model.add(Activation('relu'))
  model.add(MaxPooling3D(poolsize=(nb_pool[0], nb_pool[0], nb_pool[0])))
  model.add(Convolution3D(nb_filter=nb_filters[1], stack_size=nb_filters[0],
@@ -85,11 +91,10 @@ def train_3D_convolution(X, y, axis_length=32, batch_size=50, nb_epoch=1):
  model.add(Dense(320, 32/2, init='normal'))
  model.add(Activation('relu'))
  model.add(Dropout(0.5))
  model.add(Dense(32/2, nb_classes, init='normal'))
  model.add(Activation('softmax'))
  # TODO(rbharath): Generalize this to support classification as well as regression.
  model.add(Dense(32/2, 1, init='normal'))

  sgd = RMSprop(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
  model.compile(loss='categorical_crossentropy', optimizer=sgd)
  model.compile(loss='mean_squared_error', optimizer=sgd)
  model.fit(X, y, batch_size=batch_size, nb_epoch=nb_epoch)

  return model
+2 −1
Original line number Diff line number Diff line
@@ -146,7 +146,8 @@ def generate_targets(input_file, input_type, columns, column_types, out_pkl,
    # TODO(rbharath): This patch is only in place until the smiles/sequence
    # support is fixed.
    if row["smiles"] is None:
      mol = Chem.MolFromSmiles("C")
      # This multiplication kludge guarantees unique smiles.
      mol = Chem.MolFromSmiles("C"*row_index)
    else:
      mol = Chem.MolFromSmiles(row["smiles"])
    row["smiles"] = smiles.get_smiles(mol)
+49 −18
Original line number Diff line number Diff line
@@ -77,6 +77,10 @@ def load_molecules(paths, dir_name="fingerprints"):
  Returns a dictionary that maps smiles strings to dicts that contain
  fingerprints, smiles strings, scaffolds, mol_ids.

  TODO(rbharath): This function assumes that all datapoints are uniquely keyed
  by smiles strings. This doesn't hold true for the pdbbind dataset. Need to find
  a more general indexing mechanism.

  Parameters
  ----------
  paths: list
@@ -100,6 +104,29 @@ def load_molecules(paths, dir_name="fingerprints"):
                                    "mol_id": mol_ids[mol]}
  return molecules 

def load_pdbbind_molecules(paths, dir_name="fingerprints"):
  """Load pdbbind dataset features keyed by smiles string.

  Returns a dictionary mapping each smiles string to a dict with keys
  "fingerprint", "scaffold", and "mol_id". Scaffolds and mol-ids are not
  available for the pdbbind pickles, so those entries are always None.

  Parameters
  ----------
  paths: list
    List of paths to pdbbind dataset directories.
  dir_name: str
    Currently ignored; pdbbind features live under "targets" (see kludge
    note below).

  Raises
  ------
  ValueError
    If a dataset directory contains no pickle files.
  """
  # TODO(rbharath): This is a total kludge. Clean up later.
  dir_name = "targets"
  molecules = {}
  for dataset_path in paths:
    pickle_dir = os.path.join(dataset_path, dir_name)
    pickle_files = os.listdir(pickle_dir)
    if not pickle_files:
      raise ValueError("No Pickle Files found to load molecules")
    for pickle_file in pickle_files:
      with gzip.open(os.path.join(pickle_dir, pickle_file), "rb") as f:
        contents = pickle.load(f)
        # Walk smiles/features in lockstep; scaffolds and mol-ids are
        # unavailable for pdbbind, so store None placeholders.
        for mol_smiles, fingerprint in zip(contents["smiles"],
                                           contents["features"]):
          molecules[mol_smiles] = {"fingerprint": fingerprint,
                                   "scaffold": None,
                                   "mol_id": None}
  return molecules

def get_target_names(paths, target_dir_name="targets"):
  """Get names of targets in provided collections.

@@ -121,7 +148,7 @@ def load_assays(paths, target_dir_name="targets"):

  Returns a dictionary that maps smiles strings to label vectors.

  TODO(rbharath): Simplify this function to only support the new pickle format.
  TODO(rbharath): Remove the use of smiles as unique identifier

  Parameters
  ----------
@@ -181,27 +208,30 @@ def load_datasets(paths, datatype="vs", **load_args):
  else:
    raise ValueError("Unsupported datatype.")

def load_pdbbind_datasets(pdbbind_paths):
def load_pdbbind_datasets(paths, target_dir_name="targets",
    fingerprint_dir_name="fingerprints"):
  """Load pdbbind datasets.

  TODO(rbharath): This uses smiles as unique identifier. FIX BEFORE RELEASE!

  Parameters
  ----------
  pdbbind_path: list 
    List of Pdbbind data files.
  """
  data = []
  for pdbbind_path in pdbbind_paths:
    with open(pdbbind_path, "rb") as csvfile:
      reader = csv.reader(csvfile)
      for row_ind, row in enumerate(reader):
        if row_ind == 0:
  data = {}
  molecules = load_pdbbind_molecules(paths)
  labels = load_assays(paths, target_dir_name)
  # TODO(rbharath): Why are there fewer descriptors than labels at times?
  # What accounts for the discrepancy? Please investigate.
  for ind, smiles in enumerate(molecules):
    if smiles not in labels:
      continue
        data.append({
          "label": row[0],
          "features": row[1],
        })
  df = pd.DataFrame(data)
  return df
    mol = molecules[smiles]
    data[ind] = {"fingerprint": mol["fingerprint"],
                 "scaffold": mol["scaffold"],
                 "labels": labels[smiles]}
  return data

def load_vs_datasets(paths, target_dir_name="targets",
    fingerprint_dir_name="fingerprints"):
@@ -242,7 +272,8 @@ def ensure_balanced(y, W):
    assert np.isclose(pos_weight, neg_weight)

def load_and_transform_dataset(paths, task_transforms,
    labels_endpoint="labels", weight_positives=True):
    labels_endpoint="labels", weight_positives=True,
    datatype="vs"):
  """Transform data labels as specified

  Parameters
@@ -255,9 +286,9 @@ def load_and_transform_dataset(paths, task_transforms,
    are performed in the order specified. An empty list corresponds to no
    transformations. Only for regression outputs.
  """
  dataset = load_datasets(paths)
  dataset = load_datasets(paths, datatype=datatype)
  X, y, W = transform_outputs(dataset, task_transforms,
      weight_positives=weight_positives)
      weight_positives=weight_positives, datatype=datatype)
  ## TODO(rbharath): Take this out once test passes
  #if weight_positives:
  #  ensure_balanced(y, W)
+30 −2
Original line number Diff line number Diff line
@@ -9,7 +9,8 @@ import numpy as np
import warnings
from deep_chem.utils.analysis import summarize_distribution

def transform_outputs(dataset, task_transforms, weight_positives=True):
def transform_outputs(dataset, task_transforms, weight_positives=True,
    datatype="pdbbind"):
  """Tranform the provided outputs

  Parameters
@@ -22,7 +23,10 @@ def transform_outputs(dataset, task_transforms, weight_positives=True):
    performed in the order specified. An empty list
    corresponds to no transformations. Only for regression outputs.
  """
  if datatype == "vs":
    X, y, W = dataset_to_numpy(dataset, weight_positives=weight_positives)
  elif datatype == "pdbbind":
    X, y, W = tensor_dataset_to_numpy(dataset)
  sorted_targets = sorted(task_transforms.keys())
  endpoints = sorted_targets
  transforms = task_transforms.copy()
@@ -109,6 +113,30 @@ def balance_positives(y, W):
    W[negative_inds, target_ind] = 1
  return W

def tensor_dataset_to_numpy(dataset, feature_endpoint="fingerprint",
    labels_endpoint="labels"):
  """Transforms a set of tensor data into numpy arrays (X, y, W).

  Parameters
  ----------
  dataset: dict
    Maps datapoint ids to dicts holding a feature tensor under
    feature_endpoint and a labels dict under labels_endpoint.
  feature_endpoint: str
    Key under which each datapoint stores its feature tensor.
  labels_endpoint: str
    Key under which each datapoint stores its labels dict.

  Returns
  -------
  (X, y, W) tuple of numpy arrays. Rows are ordered by sorted datapoint
  id, so the output is deterministic. W is all-ones (no reweighting).
  """
  n_samples = len(dataset)
  # Infer the per-sample feature shape from an arbitrary datapoint;
  # assumes every datapoint shares the same tensor shape — TODO confirm.
  sample_datapoint = next(iter(dataset.values()))
  feature_shape = np.shape(sample_datapoint[feature_endpoint])
  n_targets = 1  # TODO(rbharath): Generalize this later
  X = np.zeros((n_samples,) + feature_shape)
  y = np.zeros((n_samples, n_targets))
  W = np.ones((n_samples, n_targets))
  # Iterate ids in sorted order so row ordering is deterministic across
  # runs (previously sorted_ids was computed but never used).
  sorted_ids = sorted(dataset.keys())
  for index, datapoint_id in enumerate(sorted_ids):
    datapoint = dataset[datapoint_id]
    X[index] = datapoint[feature_endpoint]
    # TODO(rbharath): The label is a dict for some reason?!? Figure this out
    # and fix it.
    y[index] = datapoint[labels_endpoint]["3d_core_pdbbind"]
  return (X, y, W)

def dataset_to_numpy(dataset, feature_endpoint="fingerprint",
    labels_endpoint="labels", weight_positives=True):
  """Transforms a loaded dataset into numpy arrays (X, y).