Commit 5616ef8d authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #213 from rbharath/singletask_dnn

Singletask DNN Support
parents b652d440 a6a537b2
Loading
Loading
Loading
Loading
+21 −12
Original line number Diff line number Diff line
@@ -214,24 +214,33 @@ class Dataset(object):
          os.path.join(self.data_dir, row['ids'])), dtype=object)
      yield (X, y, w, ids)

  def iterbatches(self, batch_size=None, epoch=0, deterministic=False):
    """Yields minibatches from the dataset, one shard at a time.

    Shards are visited in random order and the samples inside each shard are
    shuffled, unless `deterministic` is True, in which case shards and samples
    are yielded in their stored order.

    Args:
      batch_size: Target number of samples per batch. If None, each shard is
        yielded as a single batch. Batches never span shards; a shard is cut
        into ceil(n_samples / batch_size) nearly equal pieces, so individual
        batches may be smaller than batch_size.
      epoch: Unused here; kept so existing callers keep working.
      deterministic: If True, disables shuffling of shards and samples.

    Yields:
      Tuples (X_batch, y_batch, w_batch, ids_batch).
    """
    num_shards = self.get_number_shards()
    if deterministic:
      shard_perm = np.arange(num_shards)
    else:
      shard_perm = np.random.permutation(num_shards)
    for shard_num in shard_perm:
      X, y, w, ids = self.get_shard(shard_num)
      n_samples = X.shape[0]
      # An empty shard has nothing to yield; skipping it also avoids a
      # division by zero below when batch_size is None.
      if n_samples == 0:
        continue
      if deterministic:
        sample_perm = np.arange(n_samples)
      else:
        sample_perm = np.random.permutation(n_samples)
      shard_batch_size = n_samples if batch_size is None else batch_size
      # np.linspace requires an integer number of points; np.ceil returns a
      # float, so cast explicitly.
      n_batches = int(np.ceil(float(n_samples) / shard_batch_size))
      interval_points = np.linspace(0, n_samples, n_batches + 1, dtype=int)
      for start, end in zip(interval_points[:-1], interval_points[1:]):
        perm_indices = sample_perm[start:end]
        X_batch = X[perm_indices, :]
        y_batch = y[perm_indices]
        w_batch = w[perm_indices]
        ids_batch = ids[perm_indices]
        yield (X_batch, y_batch, w_batch, ids_batch)

  def reshard(self, shard_size):
+9 −6
Original line number Diff line number Diff line
@@ -223,12 +223,14 @@ class DataLoader(object):
  def _featurize_shard(self, df_shard, write_fn, shard_num, input_type):
    """Featurizes a shard of an input dataframe."""
    field = self.mol_field if input_type == "sdf" else self.smiles_field 
    field_type = "mol" if input_type == "sdf" else "smiles" 
    log("Currently featurizing feature_type: %s"
        % self.featurizer.__class__.__name__, self.verbosity)
    if isinstance(self.featurizer, UserDefinedFeaturizer):
      self._add_user_specified_features(df_shard, self.featurizer)
    elif isinstance(self.featurizer, Featurizer):
      self._featurize_mol(df_shard, self.featurizer, field=field)
      self._featurize_mol(df_shard, self.featurizer, field=field,
                          field_type=field_type)
    elif isinstance(self.featurizer, ComplexFeaturizer):
      self._featurize_complexes(df_shard, self.featurizer)
    basename = "shard-%d" % shard_num 
@@ -273,8 +275,8 @@ class DataLoader(object):
                                      zip(ligand_pdbs, protein_pdbs))
    df[featurizer.__class__.__name__] = list(features)

  def _featurize_mol(self, df, featurizer, parallel=True, field="mol",
                     worker_pool=None):    
  def _featurize_mol(self, df, featurizer, parallel=True, field_type="mol",
                     field=None, worker_pool=None):    
    """Featurize individual compounds.

       Given a featurizer that operates on individual chemical compounds 
@@ -289,13 +291,14 @@ class DataLoader(object):

      TODO(rbharath): Needs to be merged with _featurize_compounds
    """
    assert field in ["mol", "smiles"]
    assert field_type in ["mol", "smiles"]
    assert field is not None
    sample_elems = df[field].tolist()

    if worker_pool is None:
      features = []
      for ind, elem in enumerate(sample_elems):
        if field == "smiles":
        if field_type == "smiles":
          mol = Chem.MolFromSmiles(elem)
        else:
          mol = elem
@@ -305,7 +308,7 @@ class DataLoader(object):
    else:
      def featurize_wrapper(elem, dilled_featurizer):
        print("Featurizing %s" % elem)
        if field == "smiles":
        if field_type == "smiles":
          mol = Chem.MolFromSmiles(smiles)
        else:
          mol = elem
+4 −2
Original line number Diff line number Diff line
@@ -137,7 +137,8 @@ class Model(object):
    """
    y_preds = []
    batch_size = self.model_params["batch_size"]
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(batch_size):
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
        batch_size, deterministic=True):
      y_pred_batch = np.reshape(self.predict_on_batch(X_batch), y_batch.shape)
      y_pred_batch = undo_transforms(y_pred_batch, transformers)
      y_preds.append(y_pred_batch)
@@ -159,7 +160,8 @@ class Model(object):
    y_preds = []
    batch_size = self.model_params["batch_size"]
    n_tasks = len(self.tasks)
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(batch_size):
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
        batch_size, deterministic=True):
      y_pred_batch = self.predict_proba_on_batch(X_batch)
      batch_size = len(y_batch)
      y_pred_batch = np.reshape(y_pred_batch, (batch_size, n_tasks, n_classes))
+24 −20
Original line number Diff line number Diff line
@@ -18,15 +18,18 @@ class SingletaskToMultitask(Model):
  Warning: This current implementation is only functional for sklearn models. 
  """
  def __init__(self, tasks, task_types, model_params, model_dir, model_builder,
               verbosity=None):
               store_in_memory=False, verbosity=None):
    self.tasks = tasks
    self.task_types = task_types
    self.model_params = model_params
    self.models = {}
    self.model_dir = model_dir
    # If models are TF models, they don't use up RAM, so can keep in memory
    self.task_models = {}
    self.task_model_dirs = {}
    self.model_builder = model_builder
    self.verbosity = verbosity
    self.store_in_memory = store_in_memory
    log("About to initialize singletask to multitask model",
        self.verbosity, "high")
    if not os.path.exists(self.model_dir):
@@ -65,23 +68,18 @@ class SingletaskToMultitask(Model):

    Warning: This current implementation is only functional for sklearn models. 
    """
    log("About to create task-specific datasets", self.verbosity, "high")
    task_datasets = self._create_task_datasets(dataset)
    for ind, task in enumerate(self.tasks):
      log("Fitting model for task %s" % task, self.verbosity, "high")
      X_task, y_task, w_task, ids_task = task_datasets[ind].to_numpy()
      task_model = self.model_builder(
          [task], {task: self.task_types[task]}, self.model_params,
          self.task_model_dirs[task],
          verbosity=self.verbosity)
      if y_task.size > 0:
        task_model.raw_model.fit(X_task, np.ravel(y_task))
      else:
        print("No labels for task %s" % task)
        print("Fitting on dummy dataset.")
        X_task_fake = np.zeros_like(X)
        y_task_fake = np.zeros_like(w_task)
        task_model.raw_model.fit(X_task_fake, y_task_fake)
      task_model.fit(task_datasets[ind])
      task_model.save()
      if self.store_in_memory:
        self.task_models[task] = task_model

  def predict_on_batch(self, X):
    """
@@ -92,6 +90,9 @@ class SingletaskToMultitask(Model):
    y_pred = np.zeros((n_samples, n_tasks))
    for ind, task in enumerate(self.tasks):
      task_type = self.task_types[task]
      if self.store_in_memory:
        task_model = self.task_models[task]
      else:
        task_model = self.model_builder(
            [task], {task: self.task_types[task]}, self.model_params,
            self.task_model_dirs[task],
@@ -114,6 +115,9 @@ class SingletaskToMultitask(Model):
    n_samples = X.shape[0]
    y_pred = np.zeros((n_samples, n_tasks, n_classes))
    for ind, task in enumerate(self.tasks):
      if self.store_in_memory:
        task_model = self.task_models[task]
      else:
        task_model = self.model_builder(
            [task], {task: self.task_types[task]}, self.model_params,
            self.task_model_dirs[task],
+2 −2
Original line number Diff line number Diff line
@@ -180,7 +180,7 @@ class TensorflowGraph(object):
      else:
        self.updates = tf.no_op(name='updates')

  def fit(self, dataset, shuffle=True, max_checkpoints_to_keep=5):
  def fit(self, dataset, shuffle=False, max_checkpoints_to_keep=5):
    """Fit the model.

    Args:
@@ -558,7 +558,7 @@ class TensorflowModel(Model):
    self.num_tasks = len(self.task_types)
    self.fit_transformers = None

  def fit(self, dataset, shuffle=True):
  def fit(self, dataset, shuffle=False):
    """
    Fits TensorflowGraph to data.
    """