Commit 59417eba authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #295 from rbharath/progressive

Initial Implementation of Progressive Networks
parents 1e08ab95 3389acea
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -138,7 +138,7 @@ class DataLoader(object):

  def featurize(self, input_files, data_dir=None, shard_size=8192,
                num_shards_per_batch=24, worker_pool=None,
                logging=True, debug=False):
                logging=True, debug=True):
    """Featurize provided files and write to specified location."""
    ############################################################## TIMING
    time1 = time.time()
+0 −36
Original line number Diff line number Diff line
@@ -15,7 +15,6 @@ import tempfile
import shutil
import numpy as np
import tensorflow as tf
from keras import backend as K
import deepchem as dc
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
@@ -100,41 +99,6 @@ class TestHyperparamOptAPI(unittest.TestCase):
      params_dict, train_dataset, valid_dataset, transformers,
      classification_metric, logdir=None)

  def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
    """Hyperparameter search over a Keras multitask classification DNN.

    Featurizes the multitask example CSV with circular fingerprints,
    scaffold-splits it, and runs a search over two `n_hidden` values scored
    by the mean Matthews correlation coefficient.
    """
    task_type = "classification"
    n_features = 1024
    # The example file is read with seventeen task columns: task0 .. task16.
    tasks = ["task%d" % task_index for task_index in range(17)]

    test_dir = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(
        test_dir, "../../models/tests/multitask_example.csv")

    loader = dc.load.DataLoader(
        tasks=tasks, smiles_field="smiles",
        featurizer=dc.feat.CircularFingerprint(size=n_features),
        verbosity="low")
    dataset = loader.featurize(input_file)

    # Only the train and validation splits are used by the search below.
    train_dataset, valid_dataset, _ = (
        dc.splits.ScaffoldSplitter().train_valid_test_split(dataset))

    metric = dc.metrics.Metric(
        dc.metrics.matthews_corrcoef, np.mean, mode="classification")

    def model_builder(model_params, model_dir):
      """Build a MultiTaskDNN from `model_params` and wrap it in a KerasModel."""
      network = dc.models.MultiTaskDNN(
          len(tasks), n_features, task_type, dropout=0., **model_params)
      return dc.models.KerasModel(network, model_dir)

    search = dc.hyper.HyperparamOpt(model_builder)
    best_model, best_hyperparams, all_results = search.hyperparam_search(
      {"n_hidden": [5, 10]}, train_dataset, valid_dataset, [],
      metric, logdir=None)

  def test_multitask_tf_mlp_ECFP_classification_hyperparam_opt(self):
    """Straightforward test of Tensorflow multitask deepchem classification API."""
    task_type = "classification"
+1 −5
Original line number Diff line number Diff line
@@ -7,7 +7,6 @@ from __future__ import unicode_literals

from deepchem.models.models import Model
from deepchem.models.sklearn_models import SklearnModel
from deepchem.models.keras_models import KerasModel
from deepchem.models.tf_keras_models.multitask_classifier import MultitaskGraphClassifier
from deepchem.models.tf_keras_models.support_classifier import SupportGraphClassifier
from deepchem.models.multitask import SingletaskToMultitask
@@ -17,7 +16,4 @@ from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifie
from deepchem.models.tensorflow_models.robust_multitask import RobustMultitaskRegressor
from deepchem.models.tensorflow_models.robust_multitask import RobustMultitaskClassifier
from deepchem.models.tensorflow_models.lr import TensorflowLogisticRegression

# TODO(rbharath): I'm not sure if this model should be exposed. Not in
# benchmark suite for example.
from deepchem.models.keras_models.fcnet import MultiTaskDNN
from deepchem.models.tensorflow_models.progressive_multitask import ProgressiveMultitaskRegressor
+99 −99
Original line number Diff line number Diff line
@@ -7,102 +7,102 @@ from __future__ import unicode_literals

import os
import numpy as np
from keras.models import Graph
from keras.models import load_model
from keras.models import model_from_json
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization 
from keras.optimizers import SGD
from deepchem.models import Model

class KerasModel(Model):
  """
  Abstract base class shared across all Keras models.

  Every method delegates to the wrapped model stored in
  `self.model_instance`; files are located through `self.model_dir`.
  """

  def save(self):
    """
    Saves underlying keras model to disk.

    Two files sharing the stem of the model filename are written: a `.json`
    file holding the architecture and an `.h5` file holding the weights.
    """
    model = self.model_instance
    filename, _ = os.path.splitext(Model.get_model_filename(self.model_dir))

    ## Note that keras requires the model architecture and weights to be stored
    ## separately. A json file is generated that specifies the model architecture.
    ## The weights will be stored in an h5 file. The pkl.gz file will store the
    ## target name.
    json_filename = "%s.%s" % (filename, "json")
    h5_filename = "%s.%s" % (filename, "h5")
    # NOTE(review): this writes to the same path that save_weights() overwrites
    # at the end of this method; confirm whether this call is still needed.
    model.save(h5_filename)
    # Save architecture
    json_string = model.to_json()
    with open(json_filename, "w") as file_obj:
      file_obj.write(json_string)
    model.save_weights(h5_filename, overwrite=True)

  def reload(self, custom_objects=None):
    """
    Load keras multitask DNN from disk.

    Parameters
    ----------
    custom_objects: dict, optional
      Forwarded to `model_from_json`. Defaults to an empty dict; a fresh one
      is created per call so no dict is shared between calls.
    """
    if custom_objects is None:
      custom_objects = {}

    filename = Model.get_model_filename(self.model_dir)
    filename, _ = os.path.splitext(filename)

    json_filename = "%s.%s" % (filename, "json")
    h5_filename = "%s.%s" % (filename, "h5")

    # Architecture comes from the json file, weights from the h5 file.
    with open(json_filename) as file_obj:
      model = model_from_json(file_obj.read(), custom_objects=custom_objects)
    model.load_weights(h5_filename)
    self.model_instance = model

  def predict_on_batch(self, X, pad_batch=False):
    """
    Makes predictions on given batch of new data.

    Parameters
    ----------
    X: np.ndarray
      Features
    pad_batch: bool, optional
      If True, X is passed through `pad_features` with `self.batch_size`
      before prediction.

    Returns
    -------
    Predictions reshaped to (len(X), number of tasks).
    """
    n_samples = len(X) 
    n_tasks = self.get_num_tasks()
    if pad_batch:
      # NOTE(review): pad_features is not among the imports visible in this
      # file -- confirm it is in scope. If padding lengthens X, the reshape
      # below presumably needs the padded rows dropped first -- confirm.
      X = pad_features(self.batch_size, X)
    y_pred = self.model_instance.predict_on_batch(X)
    y_pred = np.reshape(y_pred, (n_samples, n_tasks))
    return y_pred

  # TODO(rbharath): The methods below aren't extensible and depend on
  # implementation details of fcnet. Better way to expose this information?
  def fit_on_batch(self, X, y, w):
    """Fit model on batch of data by delegating to the wrapped model."""
    return self.model_instance.fit_on_batch(X, y, w)

  def get_num_tasks(self):
    """Return `n_tasks` as reported by the wrapped model."""
    return self.model_instance.n_tasks
  
  def predict_proba_on_batch(self, X, pad_batch=False, n_classes=2):
    """
    Makes predictions of class probabilities on given batch of new data.

    Parameters
    ----------
    X: np.ndarray
      Features
    pad_batch: bool, optional
      If True, X is passed through `pad_features` with `self.batch_size`
      before prediction.
    n_classes: int
      Number of classifier classes

    Returns
    -------
    Class probabilities reshaped to (len(X), number of tasks, n_classes).
    """
    n_samples = len(X) 
    n_tasks = self.get_num_tasks()
    
    if pad_batch:
      # NOTE(review): pad_features is not among the imports visible in this
      # file -- confirm it is in scope before relying on pad_batch=True.
      X = pad_features(self.batch_size, X)
    y_pred_proba = self.model_instance.predict_proba_on_batch(X,
        n_classes)
    y_pred_proba = np.reshape(y_pred_proba, (n_samples, n_tasks, n_classes))
    return y_pred_proba
#from keras.models import Graph
#from keras.models import load_model
#from keras.models import model_from_json
#from keras.layers.core import Dense, Dropout, Activation
#from keras.layers.normalization import BatchNormalization 
#from keras.optimizers import SGD
#from deepchem.models import Model
#
#class KerasModel(Model):
#  """
#  Abstract base class shared across all Keras models.
#  """
#
#  def save(self):
#    """
#    Saves underlying keras model to disk.
#    """
#    model = self.model_instance
#    filename, _ = os.path.splitext(Model.get_model_filename(self.model_dir))
#
#    ## Note that keras requires the model architecture and weights to be stored
#    ## separately. A json file is generated that specifies the model architecture.
#    ## The weights will be stored in an h5 file. The pkl.gz file will store the
#    ## target name.
#    json_filename = "%s.%s" % (filename, "json")
#    h5_filename = "%s.%s" % (filename, "h5")
#    self.model_instance.save(h5_filename)
#    # Save architecture
#    json_string = model.to_json()
#    with open(json_filename, "w") as file_obj:
#      file_obj.write(json_string)
#    model.save_weights(h5_filename, overwrite=True)
#
#  def reload(self, custom_objects={}):
#    """
#    Load keras multitask DNN from disk.
#    """
#    filename = Model.get_model_filename(self.model_dir)
#    filename, _ = os.path.splitext(filename)
#
#    json_filename = "%s.%s" % (filename, "json")
#    h5_filename = "%s.%s" % (filename, "h5")
#
#    with open(json_filename) as file_obj:
#      model = model_from_json(file_obj.read(), custom_objects=custom_objects)
#    model.load_weights(h5_filename)
#    self.model_instance = model
#
#  def predict_on_batch(self, X, pad_batch=False):
#    """
#    Makes predictions on given batch of new data.
#
#    Parameters
#    ----------
#    X: np.ndarray
#      Features
#    pad_batch: bool, optional
#      Used for Tensorflow models with rigid batch-size requirements.
#    """
#    n_samples = len(X) 
#    n_tasks = self.get_num_tasks()
#    if pad_batch:
#      X = pad_features(self.batch_size, X)
#    y_pred = self.model_instance.predict_on_batch(X)
#    y_pred = np.reshape(y_pred, (n_samples, n_tasks))
#    return y_pred
#
#  # TODO(rbharath): The methods below aren't extensible and depend on
#  # implementation details of fcnet. Better way to expose this information?
#  def fit_on_batch(self, X, y, w):
#    """Fit model on batch of data."""
#    return self.model_instance.fit_on_batch(X, y, w)
#
#  def get_num_tasks(self):
#    return self.model_instance.n_tasks
#  
#  def predict_proba_on_batch(self, X, pad_batch=False, n_classes=2):
#    """
#    Makes predictions of class probabilities on given batch of new data.
#
#    Parameters
#    ----------
#    X: np.ndarray
#      Features
#    pad_batch: bool, optional
#      Ignored for Sklearn Model. Only used for Tensorflow models
#      with rigid batch-size requirements.
#    n_classes: int
#      Number of classifier classes
#    """
#    n_samples = len(X) 
#    n_tasks = self.get_num_tasks()
#    
#    if pad_batch:
#      X = pad_features(self.batch_size, X)
#    y_pred_proba = self.model_instance.predict_proba_on_batch(X,
#        n_classes)
#    y_pred_proba = np.reshape(y_pred_proba, (n_samples, n_tasks, n_classes))
#    return y_pred_proba
+51 −14
Original line number Diff line number Diff line
@@ -108,10 +108,13 @@ class TensorflowGraphModel(Model):
  def __init__(self, n_tasks, n_features, logdir=None, layer_sizes=[1000],
               weight_init_stddevs=[.02], bias_init_consts=[1.], penalty=0.0,
               penalty_type="l2", dropouts=[0.5], learning_rate=.001,
               momentum=".9", optimizer="adam", batch_size=50, n_classes=2,
               train=True, verbosity=None, seed=None, **kwargs):
               momentum=.9, optimizer="adam", batch_size=50, n_classes=2,
               verbosity="high", seed=None, **kwargs):
    """Constructs the computational graph.

    This function constructs the computational graph for the model. It relies
    on subclassed methods (build/cost) to construct specific graphs.

    Parameters
    ----------
    n_tasks: int
@@ -120,9 +123,34 @@ class TensorflowGraphModel(Model):
      Number of features.
    logdir: str
      Location to save data

    This function constructs the computational graph for the model. It relies
    on subclassed methods (build/cost) to construct specific graphs.
    layer_sizes: list
      List of layer sizes.
    weight_init_stddevs: list
      List of standard deviations for weights (sampled from zero-mean
      gaussians). One for each layer.
    bias_init_consts: list
      List of bias initializations. One for each layer.
    penalty: float
      Amount of penalty (l2 or l1 applied)
    penalty_type: str
      Either "l2" or "l1"
    dropouts: list
      List of dropout amounts. One for each layer.
    learning_rate: float
      Learning rate for model.
    momentum: float
      Momentum. Only applied if optimizer=="momentum"
    optimizer: str
      Type of optimizer applied.
    batch_size: int
      Size of minibatches for training.
    n_classes: int
      Number of classes if this is for classification.
      TODO(rbharath): Move this argument to TensorflowClassifier
    verbosity: str
      Must be one of ['high', 'low', None]. Amount of logging to do.
    seed: int
      If not none, is used as random seed for tensorflow. 
    """
    # Save hyperparameters
    self.n_tasks = n_tasks
@@ -138,7 +166,6 @@ class TensorflowGraphModel(Model):
    self.optimizer = optimizer
    self.batch_size = batch_size
    self.n_classes = n_classes
    self.train = train
    self.verbosity = verbosity
    self.seed = seed
    
@@ -247,13 +274,24 @@ class TensorflowGraphModel(Model):
          max_checkpoints_to_keep=5, log_every_N_batches=50, **kwargs):
    """Fit the model.

    Args:
      dataset: Dataset object that represents data on disk.
      max_checkpoints_to_keep: Integer. Maximum number of checkpoints to keep;
        older checkpoints will be deleted.

    Raises:
      AssertionError: If model is not in training mode.
    Parameters
    ---------- 
    dataset: dc.data.Dataset
      Dataset object holding training data 
    nb_epoch: int
      Number of training epochs.
    pad_batches: bool
      Whether or not to pad each batch to exactly be of size batch_size.
    max_checkpoints_to_keep: int
      Maximum number of checkpoints to keep; older checkpoints will be deleted.
    log_every_N_batches: int
      Report every N batches. Useful for training on very large datasets,
      where epochs can take long time to finish.

    Raises
    ------
    AssertionError
      If model is not in training mode.
    """
    ############################################################## TIMING
    time1 = time.time()
@@ -509,7 +547,6 @@ class TensorflowClassifier(TensorflowGraphModel):
      # run eval data through the model
      n_tasks = self.n_tasks
      output = []
      start = time.time()
      with self._get_shared_session(train=False).as_default():
        feed_dict = self.construct_feed_dict(X)
        data = self._get_shared_session(train=False).run(
Loading