Commit d8c0d194 authored by Bharath Ramsundar

First step of Model refactor

parent 395bbd84
+17 −11
@@ -114,7 +114,8 @@ class Dataset(object):
   Wrapper class for dataset transformed into X, y, w numpy ndarrays.
   """
   def __init__(self, data_dir=None, tasks=[], metadata_rows=None, #featurizers=None,
-               raw_data=None, verbosity=None, reload=False):
+               raw_data=None, verbosity=None, reload=False,
+               compute_feature_statistics=True):
     """
     Turns featurized dataframes into numpy files, writes them & metadata to disk.
     """
@@ -132,7 +133,9 @@ class Dataset(object):
         metadata_rows = []
         ids, X, y, w = raw_data
         metadata_rows.append(
-            Dataset.write_data_to_disk(self.data_dir, "data", tasks, X, y, w, ids))
+            Dataset.write_data_to_disk(
+                self.data_dir, "data", tasks, X, y, w, ids,
+                compute_feature_statistics=compute_feature_statistics))
         self.metadata_df = Dataset.construct_metadata(metadata_rows)
         self.save_to_disk()
       else:
@@ -153,7 +156,7 @@ class Dataset(object):
   @staticmethod
   def write_dataframe(val, data_dir, featurizer=None, tasks=None,
                       raw_data=None, basename=None, mol_id_field="mol_id",
-                      verbosity=None):
+                      verbosity=None, compute_feature_statistics=None):
     """Writes data from dataframe to disk."""
     if featurizer is not None and tasks is not None:
       feature_type = featurizer.__class__.__name__
@@ -161,6 +164,7 @@ class Dataset(object):
       # TODO(rbharath): This is a hack. clean up.
       if not len(df):
         return None
+      if compute_feature_statistics is None:
         if hasattr(featurizer, "dtype"):
           dtype = featurizer.dtype
           compute_feature_statistics = False
@@ -386,7 +390,8 @@ class Dataset(object):
     self.save_to_disk()

   @staticmethod
-  def from_numpy(data_dir, X, y, w=None, ids=None, tasks=None, verbosity=None):
+  def from_numpy(data_dir, X, y, w=None, ids=None, tasks=None, verbosity=None,
+                 compute_feature_statistics=True):
     n_samples = len(X)
     # The -1 indicates that y will be reshaped to have length -1
     if n_samples > 0:
@@ -402,7 +407,8 @@ class Dataset(object):
       tasks = np.arange(n_tasks)
     raw_data = (ids, X, y, w)
     return Dataset(data_dir=data_dir, tasks=tasks, raw_data=raw_data,
-                   verbosity=verbosity,
+                   verbosity=verbosity,
+                   compute_feature_statistics=compute_feature_statistics)

   @staticmethod
   def merge(merge_dir, datasets):
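Taken together, this file's changes thread a new compute_feature_statistics flag from Dataset.from_numpy through the Dataset constructor down to write_data_to_disk, letting callers skip per-feature mean/std bookkeeping for featurizations where such statistics are not meaningful. A minimal usage sketch against this commit; the data directory, array shapes, and the deepchem.datasets import path are illustrative assumptions:

    import numpy as np
    from deepchem.datasets import Dataset  # assumed import path at this commit

    # Toy data: 10 samples, 16 features, 2 tasks.
    X = np.random.rand(10, 16)
    y = np.random.randint(2, size=(10, 2))

    # Skip per-feature statistics when writing shards to disk.
    dataset = Dataset.from_numpy("/tmp/example_dataset", X, y,
                                 compute_feature_statistics=False)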
+24 −29
@@ -27,19 +27,24 @@ class Model(object):
  """
  Abstract base class for different ML models.
  """
  def __init__(self, tasks, task_types, model_params, model_dir, fit_transformers=None,
               model_instance=None, initialize_raw_model=True, 
               verbosity=None, **kwargs):
    self.model_class = model_instance.__class__
    self.model_dir = model_dir
  def __init__(self, model_instance, n_tasks, model_dir,
               fit_transformers=None, verbosity=None):
    """Abstract class for all models.
    Parameters:
    -----------
    model_instance: object
      Wrapper around ScikitLearn/Keras/Tensorflow model object.
    n_tasks: int
      Number of tasks for this model.
    """
    if not os.path.exists(self.model_dir):
      os.makedirs(self.model_dir)
    self.tasks = tasks
    self.task_types = task_types
    self.model_params = model_params
    self.model_instance = model_instance
    self.model_class = model_instance.__class__
    self.model_dir = model_dir
    self.n_tasks = n_tasks
    self.fit_transformers = fit_transformers

    self.raw_model = None
    assert verbosity in [None, "low", "high"]
    self.verbosity = verbosity

@@ -64,18 +69,6 @@ class Model(object):
     raise NotImplementedError(
         "Each model is responsible for its own predict_on_batch method.")

-  def set_raw_model(self, raw_model):
-    """
-    Set underlying raw model. Useful when loading from disk.
-    """
-    self.raw_model = raw_model
-
-  def get_raw_model(self):
-    """
-    Return raw model.
-    """
-    return self.raw_model
-
   def reload(self):
     """
     Reload trained model from disk.
@@ -100,7 +93,6 @@ class Model(object):
   def save(self):
     """Dispatcher function for saving."""
     params = {"model_params" : self.model_params,
-              "task_types" : self.task_types,
               "model_class": self.__class__}
     save_to_disk(params, Model.get_params_filename(self.model_dir))

@@ -151,7 +143,7 @@ class Model(object):
    """
    y_preds = []
    batch_size = self.model_params["batch_size"]
    n_tasks = len(self.tasks)
    n_tasks = self.n_tasks
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
        batch_size, deterministic=True):
      n_samples = len(X_batch)
@@ -162,7 +154,7 @@ class Model(object):
  
     # The iterbatches does padding with zero-weight examples on the last batch.
     # Remove padded examples.
-    n_samples, n_tasks = len(dataset), len(self.tasks)
+    n_samples = len(dataset)
     y_pred = np.reshape(y_pred, (n_samples, n_tasks))
     # Special case to handle singletasks.
     if n_tasks == 1:
@@ -215,7 +207,7 @@ class Model(object):
     y_pred = np.vstack(y_preds)
     y = np.vstack(y_train)

-    n_samples, n_tasks = len(dataset), len(self.tasks)
+    n_samples, n_tasks = len(dataset), self.n_tasks
     n_atoms = int((n_tasks-1)/3)

     y_pred = np.reshape(y_pred, (n_samples, n_tasks))
@@ -272,7 +264,7 @@ class Model(object):
     y = np.vstack(y_train)
     grad = np.vstack(grads)

-    n_samples, n_tasks = len(dataset), len(self.tasks)
+    n_samples, n_tasks = len(dataset), self.n_tasks
     n_atoms = int((n_tasks-1)/3)

     y_pred = np.reshape(y_pred, (n_samples, n_tasks))
@@ -362,7 +354,7 @@ class Model(object):
    """
    y_preds = []
    batch_size = self.model_params["batch_size"]
    n_tasks = len(self.tasks)
    n_tasks = self.n_tasks
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
        batch_size, deterministic=True):
      y_pred_batch = self.predict_proba_on_batch(X_batch)
@@ -373,7 +365,7 @@ class Model(object):
     y_pred = np.vstack(y_preds)
     # The iterbatches does padding with zero-weight examples on the last batch.
     # Remove padded examples.
-    n_samples, n_tasks = len(dataset), len(self.tasks)
+    n_samples = len(dataset)
     y_pred = y_pred[:n_samples]
     y_pred = np.reshape(y_pred, (n_samples, n_tasks, n_classes))
     return y_pred
@@ -382,6 +374,9 @@ class Model(object):
    """
    Currently models can only be classifiers or regressors.
    """
    ################################################################## DEBUG
    # TODO(rbharath): This is a hack based on fact that multi-tasktype models
    # aren't supported.
    return self.task_types.itervalues().next()
    #return self.task_types.itervalues().next()
    raise NotImplementedError
    ################################################################## DEBUG
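The upshot of these hunks: Model.__init__ now takes the wrapped model object and an explicit n_tasks count in place of the old tasks/task_types/model_params dictionaries, and the raw_model accessors are gone in favor of the model_instance attribute. A hedged sketch of the new construction pattern; the scikit-learn estimator, import path, and model_dir are illustrative assumptions, not taken from this commit:

    from sklearn.ensemble import RandomForestClassifier
    from deepchem.models import Model  # assumed import path

    # Wrap an underlying estimator; n_tasks replaces the old tasks list.
    rf = RandomForestClassifier(n_estimators=100)
    model = Model(rf, n_tasks=1, model_dir="/tmp/rf_model")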
+2 −3
@@ -24,7 +24,7 @@ class KerasModel(Model):
     Saves underlying keras model to disk.
     """
     super(KerasModel, self).save()
-    model = self.get_raw_model()
+    model = self.model_instance
     filename, _ = os.path.splitext(Model.get_model_filename(self.model_dir))

     # Note that keras requires the model architecture and weights to be stored
@@ -52,5 +52,4 @@ class KerasModel(Model):
     with open(json_filename) as file_obj:
       model = model_from_json(file_obj.read())
     model.load_weights(h5_filename)
-    self.raw_model = model
+    self.model_instance = model
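With raw_model removed, saving and reloading round-trip through model_instance. A sketch of the expected flow, assuming model is an already-fitted KerasModel:

    model.save()    # writes the architecture (JSON) and weights (HDF5) under model_dir
    model.reload()  # rebuilds the Keras model from those files into model.model_instance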
+89 −86
@@ -15,88 +15,98 @@ from keras.optimizers import SGD
 from deepchem.models.keras_models import KerasModel
 from deepchem.metrics import to_one_hot

-class MultiTaskDNN(KerasModel):
+class MultiTaskDNN(Graph):
   """
   Model for multitask MLP in keras.
+
+  TODO(rbharath): Port this code over to use Keras's new functional API
+  instead of using the legacy Graph object.
   """
-  def __init__(self, tasks, task_types, model_params, model_dir,
-               fit_transformers=None,
-               initialize_raw_model=True, verbosity="low"):
-    super(MultiTaskDNN, self).__init__(tasks, task_types, model_params,
-                                       model_dir,
-                                       fit_transformers=fit_transformers,
-                                       initialize_raw_model=initialize_raw_model,
-                                       verbosity=verbosity)
-    if initialize_raw_model:
-      (n_inputs,) = model_params["data_shape"]
-      model = Graph()
-      model.add_input(name="input", input_shape=(n_inputs,))
+  def __init__(self, n_tasks, n_inputs, task_type, nb_layers=1, nb_hidden=1000,
+               init="glorot_uniform", batchnorm=False, dropout=0.5,
+               activation="relu", learning_rate=.001, decay=1e-6,
+               momentum=0.9, nesterov=False, verbosity="low"):
+    super(MultiTaskDNN, self).__init__()
+    # Store hyperparameters
+    assert task_type in ["classification", "regression"]
+    self.n_tasks = n_tasks
+    self.task_type = task_type
+    self.n_inputs = n_inputs
+    self.nb_layers = nb_layers
+    self.nb_hidden = nb_hidden
+    self.init = init
+    self.batchnorm = batchnorm
+    self.dropout = dropout
+    self.activation = activation
+    self.learning_rate = learning_rate
+    self.decay = decay
+    self.momentum = momentum
+    self.nesterov = nesterov
+
+    self.add_input(name="input", input_shape=(self.n_inputs,))
+    prev_layer = "input"
-      for ind, layer in enumerate(range(model_params["nb_layers"])):
+    for ind in range(self.nb_layers):
       dense_layer_name = "dense%d" % ind
       activation_layer_name = "activation%d" % ind
       batchnorm_layer_name = "batchnorm%d" % ind
       dropout_layer_name = "dropout%d" % ind
-        model.add_node(
-            Dense(model_params["nb_hidden"], init=model_params["init"]),
+      self.add_node(
+          Dense(self.nb_hidden, init=self.init),
           name=dense_layer_name, input=prev_layer)
       prev_layer = dense_layer_name
-        if model_params["batchnorm"]:
-          model.add_node(
+      if self.batchnorm:
+        self.add_node(
           BatchNormalization(), input=prev_layer, name=batchnorm_layer_name)
         prev_layer = batchnorm_layer_name
-        model.add_node(Activation(model_params["activation"]),
+      self.add_node(Activation(self.activation),
                     name=activation_layer_name, input=prev_layer)
       prev_layer = activation_layer_name
-        if model_params["dropout"] > 0:
-          model.add_node(Dropout(model_params["dropout"]),
+      if self.dropout > 0:
+        self.add_node(Dropout(self.dropout),
                       name=dropout_layer_name,
                       input=prev_layer)
         prev_layer = dropout_layer_name
-      for ind, task in enumerate(self.tasks):
-        task_type = task_types[task]
-        if task_type == "classification":
-          model.add_node(
-              Dense(2, init=model_params["init"], activation="softmax"),
-              name="dense_head%d" % ind, input=prev_layer)
-        elif task_type == "regression":
-          model.add_node(
-              Dense(1, init=model_params["init"]),
-              name="dense_head%d" % ind, input=prev_layer)
-        model.add_output(name="task%d" % ind, input="dense_head%d" % ind)
+    for task in range(self.n_tasks):
+      if self.task_type == "classification":
+        self.add_node(
+            Dense(2, init=self.init, activation="softmax"),
+            name="dense_head%d" % task, input=prev_layer)
+      elif self.task_type == "regression":
+        self.add_node(
+            Dense(1, init=self.init),
+            name="dense_head%d" % task, input=prev_layer)
+      self.add_output(name="task%d" % task, input="dense_head%d" % task)

     loss_dict = {}
-      for ind, task in enumerate(self.tasks):
-        task_type, taskname = task_types[task], "task%d" % ind
-        if task_type == "classification":
+    for task in range(self.n_tasks):
+      taskname = "task%d" % task
+      if self.task_type == "classification":
         loss_dict[taskname] = "binary_crossentropy"
-        elif task_type == "regression":
+      elif self.task_type == "regression":
         loss_dict[taskname] = "mean_squared_error"
-      sgd = SGD(lr=model_params["learning_rate"],
-                decay=model_params["decay"],
-                momentum=model_params["momentum"],
-                nesterov=model_params["nesterov"])
-      model.compile(optimizer=sgd, loss=loss_dict)
-      self.raw_model = model
+    sgd = SGD(lr=self.learning_rate,
+              decay=self.decay,
+              momentum=self.momentum,
+              nesterov=self.nesterov)
+    self.compile(optimizer=sgd, loss=loss_dict)

   def get_data_dict(self, X, y=None):
     """Wrap data X in dict for graph computations (Keras graph only for now)."""
     data = {}
     data["input"] = X
-    for ind, task in enumerate(self.tasks):
-      task_type, taskname = self.task_types[task], "task%d" % ind
+    for task in range(self.n_tasks):
+      taskname = "task%d" % task
       if y is not None:
-        if task_type == "classification":
-          data[taskname] = to_one_hot(y[:, ind])
-        elif task_type == "regression":
-          data[taskname] = y[:, ind]
+        if self.task_type == "classification":
+          data[taskname] = to_one_hot(y[:, task])
+        elif self.task_type == "regression":
+          data[taskname] = y[:, task]
     return data

   def get_sample_weight(self, w):
     """Get dictionaries needed to fit models"""
     sample_weight = {}
-    for ind in range(len(self.tasks)):
-      sample_weight["task%d" % ind] = w[:, ind]
+    for task in range(self.n_tasks):
+      sample_weight["task%d" % task] = w[:, task]
     return sample_weight

   def fit_on_batch(self, X, y, w):
@@ -118,18 +128,16 @@ class MultiTaskDNN(KerasModel):
     data = self.get_data_dict(X)
     y_pred_dict = self.raw_model.predict_on_batch(data)
     nb_samples = np.shape(X)[0]
-    nb_tasks = len(self.tasks)
-    y_pred = np.zeros((nb_samples, nb_tasks))
-    for ind, task in enumerate(self.tasks):
-      task_type = self.task_types[task]
-      taskname = "task%d" % ind
-      if task_type == "classification":
+    y_pred = np.zeros((nb_samples, self.n_tasks))
+    for task in range(self.n_tasks):
+      taskname = "task%d" % task
+      if self.task_type == "classification":
         # Class probabilities are predicted for classification outputs. Instead,
         # output the most likely class.
         y_pred_task = np.squeeze(np.argmax(y_pred_dict[taskname], axis=1))
       else:
         y_pred_task = np.squeeze(y_pred_dict[taskname])
-      y_pred[:, ind] = y_pred_task
+      y_pred[:, task] = y_pred_task
     y_pred = np.squeeze(y_pred)
     return y_pred

@@ -140,13 +148,11 @@ class MultiTaskDNN(KerasModel):
     data = self.get_data_dict(X)
     y_pred_dict = self.raw_model.predict_on_batch(data)
     n_samples = np.shape(X)[0]
-    n_tasks = len(self.tasks)
-    y_pred = np.zeros((n_samples, n_tasks, n_classes))
-    for ind, task in enumerate(self.tasks):
-      task_type = self.task_types[task]
-      taskname = "task%d" % ind
+    y_pred = np.zeros((n_samples, self.n_tasks, n_classes))
+    for task in range(self.n_tasks):
+      taskname = "task%d" % task
       y_pred_task = np.squeeze(y_pred_dict[taskname])
-      y_pred[:, ind] = y_pred_task
+      y_pred[:, task] = y_pred_task
     y_pred = np.squeeze(y_pred)
     return y_pred

@@ -154,10 +160,7 @@ class SingleTaskDNN(MultiTaskDNN):
  """
  Abstract base class for different ML models.
  """
  def __init__(self, tasks, task_types, model_params, model_dir,
               fit_transformers=None, initialize_raw_model=True,
               verbosity="low"):
  def __init__(self, n_inputs, task_type, **kwargs):
    n_tasks = 1
    super(SingleTaskDNN, self).__init__(
        tasks, task_types, model_params, model_dir,
        fit_transformers=fit_transformers,
        initialize_raw_model=initialize_raw_model, verbosity=verbosity)
        n_tasks, n_inputs, task_type, **kwargs)
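After this rewrite, MultiTaskDNN is itself a Keras Graph configured by explicit hyperparameter arguments instead of a model_params dict, and SingleTaskDNN is just the n_tasks=1 special case. A hedged construction sketch; the module path and every hyperparameter value below are illustrative assumptions:

    from deepchem.models.keras_models.fcnet import MultiTaskDNN  # assumed module path

    # Two hidden layers of 500 units feeding 12 classification heads.
    model = MultiTaskDNN(n_tasks=12, n_inputs=1024,
                         task_type="classification",
                         nb_layers=2, nb_hidden=500, dropout=0.25)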
+0 −2
@@ -28,8 +28,6 @@ base_data_dir = "/scratch/users/rbharath/muv"

 muv_tasks, dataset, transformers = load_muv(
     base_data_dir, reload=reload)
-print("len(dataset)")
-print(len(dataset))

 base_dir = "/scratch/users/rbharath/muv_analysis"
 if os.path.exists(base_dir):