Merge pull request #295 from rbharath/progressive (59417eba) · Commits · 钟慕尧 / deepchem

deepchem/feat/featurize.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -138,7 +138,7 @@ class DataLoader(object):

		def featurize(self, input_files, data_dir=None, shard_size=8192,
		num_shards_per_batch=24, worker_pool=None,
		logging=True, debug=False):
		logging=True, debug=True):
		"""Featurize provided files and write to specified location."""
		############################################################## TIMING
		time1 = time.time()

deepchem/hyper/tests/test_hyperparam_opt.py

+0 −36

Original line number	Diff line number	Diff line
		@@ -15,7 +15,6 @@ import tempfile
		import shutil
		import numpy as np
		import tensorflow as tf
		from keras import backend as K
		import deepchem as dc
		from sklearn.ensemble import RandomForestClassifier
		from sklearn.ensemble import RandomForestRegressor
		@@ -100,41 +99,6 @@ class TestHyperparamOptAPI(unittest.TestCase):
		params_dict, train_dataset, valid_dataset, transformers,
		classification_metric, logdir=None)

		def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
		"""Straightforward test of Keras multitask deepchem classification API."""
		task_type = "classification"
		current_dir = os.path.dirname(os.path.abspath(__file__))
		input_file = os.path.join(
		current_dir, "../../models/tests/multitask_example.csv")
		tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
		"task7", "task8", "task9", "task10", "task11", "task12",
		"task13", "task14", "task15", "task16"]

		n_features = 1024
		featurizer = dc.feat.CircularFingerprint(size=n_features)
		loader = dc.load.DataLoader(
		tasks=tasks, smiles_field="smiles",
		featurizer=featurizer, verbosity="low")
		dataset = loader.featurize(input_file)

		splitter = dc.splits.ScaffoldSplitter()
		train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
		dataset)

		transformers = []
		metric = dc.metrics.Metric(
		dc.metrics.matthews_corrcoef, np.mean, mode="classification")
		params_dict= {"n_hidden": [5, 10]}

		def model_builder(model_params, model_dir):
		keras_model = dc.models.MultiTaskDNN(
		len(tasks), n_features, task_type, dropout=0., **model_params)
		return dc.models.KerasModel(keras_model, model_dir)
		optimizer = dc.hyper.HyperparamOpt(model_builder)
		best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
		params_dict, train_dataset, valid_dataset, transformers,
		metric, logdir=None)

		def test_multitask_tf_mlp_ECFP_classification_hyperparam_opt(self):
		"""Straightforward test of Tensorflow multitask deepchem classification API."""
		task_type = "classification"

deepchem/models/init.py

+1 −5

Original line number	Diff line number	Diff line
		@@ -7,7 +7,6 @@ from __future__ import unicode_literals

		from deepchem.models.models import Model
		from deepchem.models.sklearn_models import SklearnModel
		from deepchem.models.keras_models import KerasModel
		from deepchem.models.tf_keras_models.multitask_classifier import MultitaskGraphClassifier
		from deepchem.models.tf_keras_models.support_classifier import SupportGraphClassifier
		from deepchem.models.multitask import SingletaskToMultitask
		@@ -17,7 +16,4 @@ from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifie
		from deepchem.models.tensorflow_models.robust_multitask import RobustMultitaskRegressor
		from deepchem.models.tensorflow_models.robust_multitask import RobustMultitaskClassifier
		from deepchem.models.tensorflow_models.lr import TensorflowLogisticRegression

		# TODO(rbharath): I'm not sure if this model should be exposed. Not in
		# benchmark suite for example.
		from deepchem.models.keras_models.fcnet import MultiTaskDNN
		from deepchem.models.tensorflow_models.progressive_multitask import ProgressiveMultitaskRegressor

deepchem/models/keras_models/init.py

+99 −99

Original line number	Diff line number	Diff line
		@@ -7,102 +7,102 @@ from __future__ import unicode_literals

		import os
		import numpy as np
		from keras.models import Graph
		from keras.models import load_model
		from keras.models import model_from_json
		from keras.layers.core import Dense, Dropout, Activation
		from keras.layers.normalization import BatchNormalization
		from keras.optimizers import SGD
		from deepchem.models import Model

		class KerasModel(Model):
		"""
		Abstract base class shared across all Keras models.
		"""

		def save(self):
		"""
		Saves underlying keras model to disk.
		"""
		model = self.model_instance
		filename, _ = os.path.splitext(Model.get_model_filename(self.model_dir))

		## Note that keras requires the model architecture and weights to be stored
		## separately. A json file is generated that specifies the model architecture.
		## The weights will be stored in an h5 file. The pkl.gz file will store the
		## target name.
		json_filename = "%s.%s" % (filename, "json")
		h5_filename = "%s.%s" % (filename, "h5")
		self.model_instance.save(h5_filename)
		# Save architecture
		json_string = model.to_json()
		with open(json_filename, "w") as file_obj:
		file_obj.write(json_string)
		model.save_weights(h5_filename, overwrite=True)

		def reload(self, custom_objects={}):
		"""
		Load keras multitask DNN from disk.
		"""
		filename = Model.get_model_filename(self.model_dir)
		filename, _ = os.path.splitext(filename)

		json_filename = "%s.%s" % (filename, "json")
		h5_filename = "%s.%s" % (filename, "h5")

		with open(json_filename) as file_obj:
		model = model_from_json(file_obj.read(), custom_objects=custom_objects)
		model.load_weights(h5_filename)
		self.model_instance = model

		def predict_on_batch(self, X, pad_batch=False):
		"""
		Makes predictions on given batch of new data.

		Parameters
		----------
		X: np.ndarray
		Features
		pad_batch: bool, optional
		Used for Tensorflow models with rigid batch-size requirements.
		"""
		n_samples = len(X)
		n_tasks = self.get_num_tasks()
		if pad_batch:
		X = pad_features(self.batch_size, X)
		y_pred = self.model_instance.predict_on_batch(X)
		y_pred = np.reshape(y_pred, (n_samples, n_tasks))
		return y_pred

		# TODO(rbharath): The methods below aren't extensible and depend on
		# implementation details of fcnet. Better way to expose this information?
		def fit_on_batch(self, X, y, w):
		"""Fit model on batch of data."""
		return self.model_instance.fit_on_batch(X, y, w)

		def get_num_tasks(self):
		return self.model_instance.n_tasks

		def predict_proba_on_batch(self, X, pad_batch=False, n_classes=2):
		"""
		Makes predictions of class probabilities on given batch of new data.

		Parameters
		----------
		X: np.ndarray
		Features
		pad_batch: bool, optional
		If True, X is passed through pad_features(self.batch_size, X) before
		prediction. Intended for models (e.g. Tensorflow) with rigid
		batch-size requirements.
		n_classes: int
		Number of classifier classes
		"""
		n_samples = len(X)
		n_tasks = self.get_num_tasks()

		if pad_batch:
		X = pad_features(self.batch_size, X)
		y_pred_proba = self.model_instance.predict_proba_on_batch(X,
		n_classes)
		y_pred_proba = np.reshape(y_pred_proba, (n_samples, n_tasks, n_classes))
		return y_pred_proba
		#from keras.models import Graph
		#from keras.models import load_model
		#from keras.models import model_from_json
		#from keras.layers.core import Dense, Dropout, Activation
		#from keras.layers.normalization import BatchNormalization
		#from keras.optimizers import SGD
		#from deepchem.models import Model
		#
		#class KerasModel(Model):
		# """
		# Abstract base class shared across all Keras models.
		# """
		#
		# def save(self):
		# """
		# Saves underlying keras model to disk.
		# """
		# model = self.model_instance
		# filename, _ = os.path.splitext(Model.get_model_filename(self.model_dir))
		#
		# ## Note that keras requires the model architecture and weights to be stored
		# ## separately. A json file is generated that specifies the model architecture.
		# ## The weights will be stored in an h5 file. The pkl.gz file will store the
		# ## target name.
		# json_filename = "%s.%s" % (filename, "json")
		# h5_filename = "%s.%s" % (filename, "h5")
		# self.model_instance.save(h5_filename)
		# # Save architecture
		# json_string = model.to_json()
		# with open(json_filename, "w") as file_obj:
		# file_obj.write(json_string)
		# model.save_weights(h5_filename, overwrite=True)
		#
		# def reload(self, custom_objects={}):
		# """
		# Load keras multitask DNN from disk.
		# """
		# filename = Model.get_model_filename(self.model_dir)
		# filename, _ = os.path.splitext(filename)
		#
		# json_filename = "%s.%s" % (filename, "json")
		# h5_filename = "%s.%s" % (filename, "h5")
		#
		# with open(json_filename) as file_obj:
		# model = model_from_json(file_obj.read(), custom_objects=custom_objects)
		# model.load_weights(h5_filename)
		# self.model_instance = model
		#
		# def predict_on_batch(self, X, pad_batch=False):
		# """
		# Makes predictions on given batch of new data.
		#
		# Parameters
		# ----------
		# X: np.ndarray
		# Features
		# pad_batch: bool, optional
		# Used for Tensorflow models with rigid batch-size requirements.
		# """
		# n_samples = len(X)
		# n_tasks = self.get_num_tasks()
		# if pad_batch:
		# X = pad_features(self.batch_size, X)
		# y_pred = self.model_instance.predict_on_batch(X)
		# y_pred = np.reshape(y_pred, (n_samples, n_tasks))
		# return y_pred
		#
		# # TODO(rbharath): The methods below aren't extensible and depend on
		# # implementation details of fcnet. Better way to expose this information?
		# def fit_on_batch(self, X, y, w):
		# """Fit model on batch of data."""
		# return self.model_instance.fit_on_batch(X, y, w)
		#
		# def get_num_tasks(self):
		# return self.model_instance.n_tasks
		#
		# def predict_proba_on_batch(self, X, pad_batch=False, n_classes=2):
		# """
		# Makes predictions of class probabilities on given batch of new data.
		#
		# Parameters
		# ----------
		# X: np.ndarray
		# Features
		# pad_batch: bool, optional
		# If True, X is passed through pad_features(self.batch_size, X) before
		# prediction. Intended for models (e.g. Tensorflow) with rigid
		# batch-size requirements.
		# n_classes: int
		# Number of classifier classes
		# """
		# n_samples = len(X)
		# n_tasks = self.get_num_tasks()
		#
		# if pad_batch:
		# X = pad_features(self.batch_size, X)
		# y_pred_proba = self.model_instance.predict_proba_on_batch(X,
		# n_classes)
		# y_pred_proba = np.reshape(y_pred_proba, (n_samples, n_tasks, n_classes))
		# return y_pred_proba

deepchem/models/tensorflow_models/init.py

+51 −14

Original line number	Diff line number	Diff line
		@@ -108,10 +108,13 @@ class TensorflowGraphModel(Model):
		def __init__(self, n_tasks, n_features, logdir=None, layer_sizes=[1000],
		weight_init_stddevs=[.02], bias_init_consts=[1.], penalty=0.0,
		penalty_type="l2", dropouts=[0.5], learning_rate=.001,
		momentum=".9", optimizer="adam", batch_size=50, n_classes=2,
		train=True, verbosity=None, seed=None, **kwargs):
		momentum=.9, optimizer="adam", batch_size=50, n_classes=2,
		verbosity="high", seed=None, **kwargs):
		"""Constructs the computational graph.

		This function constructs the computational graph for the model. It relies
		on subclassed methods (build/cost) to construct specific graphs.

		Parameters
		----------
		n_tasks: int
		@@ -120,9 +123,34 @@ class TensorflowGraphModel(Model):
		Number of features.
		logdir: str
		Location to save data

		This function constructs the computational graph for the model. It relies
		on subclassed methods (build/cost) to construct specific graphs.
		layer_sizes: list
		List of layer sizes.
		weight_init_stddevs: list
		List of standard deviations for weights (sampled from zero-mean
		gaussians). One for each layer.
		bias_init_consts: list
		List of bias initializations. One for each layer.
		penalty: float
		Amount of penalty (l2 or l1 applied)
		penalty_type: str
		Either "l2" or "l1"
		dropouts: list
		List of dropout amounts. One for each layer.
		learning_rate: float
		Learning rate for model.
		momentum: float
		Momentum. Only applied if optimizer=="momentum"
		optimizer: str
		Type of optimizer applied.
		batch_size: int
		Size of minibatches for training.
		n_classes: int
		Number of classes if this is for classification.
		TODO(rbharath): Move this argument to TensorflowClassifier
		verbosity: str
		Must be one of ['high', 'low', None]. Amount of logging to do.
		seed: int
		If not none, is used as random seed for tensorflow.
		"""
		# Save hyperparameters
		self.n_tasks = n_tasks
		@@ -138,7 +166,6 @@ class TensorflowGraphModel(Model):
		self.optimizer = optimizer
		self.batch_size = batch_size
		self.n_classes = n_classes
		self.train = train
		self.verbosity = verbosity
		self.seed = seed

		@@ -247,13 +274,24 @@ class TensorflowGraphModel(Model):
		max_checkpoints_to_keep=5, log_every_N_batches=50, **kwargs):
		"""Fit the model.

		Args:
		dataset: Dataset object that represents data on disk.
		max_checkpoints_to_keep: Integer. Maximum number of checkpoints to keep;
		older checkpoints will be deleted.

		Raises:
		AssertionError: If model is not in training mode.
		Parameters
		----------
		dataset: dc.data.Dataset
		Dataset object holding training data
		nb_epoch: int
		Number of training epochs.
		pad_batches: bool
		Whether or not to pad each batch to exactly be of size batch_size.
		max_checkpoints_to_keep: int
		Maximum number of checkpoints to keep; older checkpoints will be deleted.
		log_every_N_batches: int
		Report every N batches. Useful for training on very large datasets,
		where epochs can take a long time to finish.

		Raises
		------
		AssertionError
		If model is not in training mode.
		"""
		############################################################## TIMING
		time1 = time.time()
		@@ -509,7 +547,6 @@ class TensorflowClassifier(TensorflowGraphModel):
		# run eval data through the model
		n_tasks = self.n_tasks
		output = []
		start = time.time()
		with self._get_shared_session(train=False).as_default():
		feed_dict = self.construct_feed_dict(X)
		data = self._get_shared_session(train=False).run(

Admin message