Commit 415033d0 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Merge pull request #35 from evanfeinberg/master

Working with Data Sets exceeding RAM 
parents 6d4f6df0 34925328
Loading
Loading
Loading
Loading
+101 −0
Original line number Original line Diff line number Diff line
"""
Contains an abstract base class that supports different ML models.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

#TODO(enf/rbharath): incorporate save, load, eval, fit features into class Model.
class Model(object):
  """
  Abstract base class for different ML models.

  Stores the per-task types and model hyperparameters, and holds a
  reference to the underlying "raw" model object (e.g. a keras or
  sklearn model) that subclasses construct.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    # initialize_raw_model is accepted so every subclass shares one
    # constructor signature; the abstract base has no raw model to build.
    self.task_types = task_types
    self.model_params = model_params
    # Ensure the attribute always exists so get_raw_model() returns None
    # (instead of raising AttributeError) before set_raw_model is called.
    self.raw_model = None

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")

  def set_raw_model(self, raw_model):
    """
    Set underlying raw model. Useful when loading from disk.
    """
    self.raw_model = raw_model

  def get_raw_model(self):
    """
    Return underlying raw model.
    """
    return self.raw_model


# NOTE(review): dead code — a legacy prediction helper kept commented out in a
# triple-quoted string (a no-op at runtime). It appears superseded by the new
# Model.predict_on_batch interface — confirm, then delete rather than keep
# accumulating commented-out code.
'''
def model_predictions(X, model, n_targets, task_types, modeltype="sklearn"):
  """Obtains predictions of provided model on test_set.

  Returns an ndarray of shape (n_samples, n_targets)

  TODO(rbharath): This function uses n_targets instead of
  task_transforms like everything else.

  Parameters
  ----------
  X: numpy.ndarray
    Test set data.
  model: model.
    A trained scikit-learn or keras model.
  n_targets: int
    Number of output targets
  task_types: dict
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
  modeltype: string
    Either sklearn, keras, or keras_multitask
  """
  # Extract features for test set and make preds
  # TODO(rbharath): This change in shape should not(!) be handled here. Make
  # an upstream change so the evaluator doesn't have to worry about this.
  if len(np.shape(X)) > 2:  # Dealing with 3D data
    if len(np.shape(X)) != 5:
      raise ValueError(
          "Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
    (n_samples, axis_length, _, _, n_channels) = np.shape(X)
    X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
  if modeltype == "keras-graph":
    predictions = model.predict({"input": X})
    ypreds = []
    for index in range(n_targets):
      ypreds.append(predictions["task%d" % index])
  elif modeltype == "sklearn":
    # Must be single-task (breaking multitask RFs here)
    task_type = task_types.itervalues().next()
    if task_type == "classification":
      print("model_predictions()")
      print("np.shape(X)")
      print(np.shape(X))
      ypreds = model.predict_proba(X)
    elif task_type == "regression":
      ypreds = model.predict(X)
  elif modeltype == "keras-sequential":
    ypreds = model.predict(X)
  else:
    raise ValueError("Improper modeltype.")
  if isinstance(ypreds, np.ndarray):
    ypreds = np.squeeze(ypreds)
  if not isinstance(ypreds, list):
    ypreds = [ypreds]
  return ypreds
'''
+67 −3
Original line number Original line Diff line number Diff line
@@ -5,8 +5,72 @@ import numpy as np
from keras.models import Graph
from keras.models import Graph
from keras.layers.core import Dense, Dropout
from keras.layers.core import Dense, Dropout
from keras.optimizers import SGD
from keras.optimizers import SGD
from deep_chem.utils.preprocess import to_one_hot
from deep_chem.models import Model


#TODO(rbharath/enf): Make this real. It's a dummy now.
class SingleTaskDNN(Model):
  """
  Single-task deep neural network model.

  Placeholder implementation: fit_on_batch/predict_on_batch are not yet
  implemented for this class.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    # Delegate common attribute setup to the Model base class, matching
    # how DockingDNN initializes itself.
    super(SingleTaskDNN, self).__init__(task_types, model_params,
                                        initialize_raw_model)
    # No underlying keras model yet; populated once fitting is implemented.
    self.raw_model = None

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")

#TODO(rbharath/enf): Make this real. It's a dummy now.
class MultiTaskDNN(Model):
  """
  Multi-task deep neural network model.

  Placeholder implementation: fit_on_batch/predict_on_batch are not yet
  implemented for this class.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    # Delegate common attribute setup to the Model base class, matching
    # how DockingDNN initializes itself.
    super(MultiTaskDNN, self).__init__(task_types, model_params,
                                       initialize_raw_model)
    # No underlying keras model yet; populated once fitting is implemented.
    self.raw_model = None

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")

def to_one_hot(y):
  """Transforms label vector into one-hot encoding.

  Turns y into vector of shape [n_samples, 2] (assuming binary labels).
  Rows whose label is neither 0 nor 1 are left all-zero, matching the
  original per-element behavior.

  Parameters
  ----------
  y: np.ndarray
    A vector of shape [n_samples, 1] (or [n_samples]).

  Returns
  -------
  np.ndarray of shape [n_samples, 2].
  """
  # Flatten so both [n_samples] and [n_samples, 1] inputs are handled.
  y = np.asarray(y).ravel()
  y_hot = np.zeros((len(y), 2))
  # Vectorized equivalent of the element-wise loop: boolean masks become
  # 1.0/0.0 on assignment into the float array.
  y_hot[:, 0] = (y == 0)
  y_hot[:, 1] = (y == 1)
  return y_hot


def fit_multitask_mlp(train_data, task_types, **training_params):
def fit_multitask_mlp(train_data, task_types, **training_params):
  """
  """
@@ -62,7 +126,7 @@ def fit_singletask_mlp(train_data, task_types, **training_params):


def train_multitask_model(X, y, W, task_types, learning_rate=0.01,
def train_multitask_model(X, y, W, task_types, learning_rate=0.01,
                          decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
                          decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
                          dropout=0.5, nb_epoch=20, batch_size=50, n_hidden=500,
                          dropout=0.5, nb_epoch=20, batch_size=50, nb_hidden=500,
                          validation_split=0.1):
                          validation_split=0.1):
  """
  """
  Perform stochastic gradient descent optimization for a keras multitask MLP.
  Perform stochastic gradient descent optimization for a keras multitask MLP.
@@ -106,7 +170,7 @@ def train_multitask_model(X, y, W, task_types, learning_rate=0.01,
  #model.add_input(name="input", ndim=n_inputs)
  #model.add_input(name="input", ndim=n_inputs)
  model.add_input(name="input", input_shape=(n_inputs,))
  model.add_input(name="input", input_shape=(n_inputs,))
  model.add_node(
  model.add_node(
      Dense(n_hidden, init='uniform', activation=activation),
      Dense(nb_hidden, init='uniform', activation=activation),
      name="dense", input="input")
      name="dense", input="input")
  model.add_node(Dropout(dropout), name="dropout", input="dense")
  model.add_node(Dropout(dropout), name="dropout", input="dense")
  top_layer = "dropout"
  top_layer = "dropout"
+72 −62
Original line number Original line Diff line number Diff line
"""
"""
Code for training 3D convolutions.
Code for training 3D convolutions.
"""
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import numpy as np
import numpy as np
from keras.optimizers import RMSprop
from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D
from keras.layers.convolutional import Convolution3D, MaxPooling3D
from deep_chem.models import Model


def fit_3D_convolution(train_data, **training_params):
def shuffle_shape(shape):
  """Permute a channels-last sample shape to channels-first order.

  Reconstructed from garbled diff interleaving (old fit_3D_convolution
  residue removed).

  Parameters
  ----------
  shape: tuple
    (axis_length, axis_length, axis_length, n_channels) — a cubic volume,
    so a single axis_length describes all three spatial axes.

  Returns
  -------
  tuple of (n_channels, axis_length, axis_length, axis_length).
  """
  (axis_length, _, _, n_channels) = shape
  return (n_channels, axis_length, axis_length, axis_length)


def shuffle_data(X):
  """Convert a batch from channels-last to channels-first layout.

  Parameters
  ----------
  X: np.ndarray
    Tensor of shape (n_samples, axis_length, axis_length, axis_length,
    n_channels).

  Returns
  -------
  np.ndarray of shape (n_samples, n_channels, axis_length, axis_length,
  axis_length) containing the same voxels.
  """
  # Bug fix: the original used np.reshape, which preserves the flat element
  # order and therefore scrambles voxel values across axes instead of moving
  # the channel axis. Relocating an axis requires a transpose (axis
  # permutation), not a reshape.
  return np.transpose(X, (0, 4, 1, 2, 3))


  """
  Fit a keras 3D CNN to datat.


  Parameters
class DockingDNN(Model):
  ----------
  nb_epoch: int
    maximal number of epochs to run the optimizer
  """
  """
  print "Training 3D model"
  Wrapper class for fitting 3D convolutional networks for deep docking.
  print "Original shape of X: " + str(np.shape(X))
  """
  print "Shuffling X dimensions to match convnet"
  def __init__(self, task_types, model_params, initialize_raw_model=True):
  # TODO(rbharath): Modify the featurization so that it matches desired shaped.
    super(DockingDNN, self).__init__(task_types, model_params, initialize_raw_model)
  (n_samples, axis_length, _, _, n_channels) = np.shape(X)
    if initialize_raw_model:
  X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
      (axis_length, _, _, n_channels) = model_params["data_shape"]
  print "Final shape of X: " + str(np.shape(X))
      self.input_shape = (n_channels, 
                          axis_length, axis_length, axis_length)

      learning_rate = model_params["learning_rate"]
      loss_function = model_params["loss_function"]


         # number of convolutional filters to use at each layer
         # number of convolutional filters to use at each layer
      nb_filters = [axis_length/2, axis_length, axis_length]
      nb_filters = [axis_length/2, axis_length, axis_length]
@@ -47,35 +46,46 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1, learning_rate=0.01,


      # level of convolution to perform at each layer (CONV x CONV)
      # level of convolution to perform at each layer (CONV x CONV)
      nb_conv = [7, 5, 3]
      nb_conv = [7, 5, 3]

      model = Sequential()
      model = Sequential()
  model.add(Convolution3D(nb_filter=nb_filters[0], stack_size=n_channels,

      model.add(Convolution3D(nb_filter=nb_filters[0], nb_depth=nb_conv[0], 
                              nb_row=nb_conv[0], nb_col=nb_conv[0],
                              nb_row=nb_conv[0], nb_col=nb_conv[0],
                          nb_depth=nb_conv[0], border_mode='valid'))
                              input_shape=self.input_shape, border_mode="full"))
      model.add(Activation('relu'))
      model.add(Activation('relu'))
  model.add(MaxPooling3D(poolsize=(nb_pool[0], nb_pool[0], nb_pool[0])))

  model.add(Convolution3D(nb_filter=nb_filters[1], stack_size=nb_filters[0],
      model.add(MaxPooling3D(pool_size=(nb_pool[0], nb_pool[0], nb_pool[0])))
                          nb_row=nb_conv[1], nb_col=nb_conv[1], nb_depth=nb_conv[1],
      model.add(Convolution3D(nb_filter=nb_filters[1],  nb_depth=nb_conv[1],
                          border_mode='valid'))
                              nb_row=nb_conv[1], nb_col=nb_conv[1], border_mode="full"))
      model.add(Activation('relu'))
      model.add(Activation('relu'))
  model.add(MaxPooling3D(poolsize=(nb_pool[1], nb_pool[1], nb_pool[1])))
      model.add(MaxPooling3D(pool_size=(nb_pool[1], nb_pool[1], nb_pool[1])))
  model.add(Convolution3D(nb_filter=nb_filters[2], stack_size=nb_filters[1],
      model.add(Convolution3D(nb_filter=nb_filters[2], nb_depth=nb_conv[2],
                          nb_row=nb_conv[2], nb_col=nb_conv[2],
                              nb_row=nb_conv[2], nb_col=nb_conv[2], border_mode="full"))
                          nb_depth=nb_conv[2], border_mode='valid'))
      model.add(Activation('relu'))
      model.add(Activation('relu'))
  model.add(MaxPooling3D(poolsize=(nb_pool[2], nb_pool[2], nb_pool[2])))
      model.add(MaxPooling3D(pool_size=(nb_pool[2], nb_pool[2], nb_pool[2])))
      model.add(Flatten())
      model.add(Flatten())
      # TODO(rbharath): If we change away from axis-size 32, this code will break.
      # TODO(rbharath): If we change away from axis-size 32, this code will break.
      # Eventually figure out a more general rule that works for all axis sizes.
      # Eventually figure out a more general rule that works for all axis sizes.
  model.add(Dense(32/2, init='normal'))
      model.add(Dense(16, init='normal'))
      model.add(Activation('relu'))
      model.add(Activation('relu'))
      model.add(Dropout(0.5))
      model.add(Dropout(0.5))
  # TODO(rbharath): Generalize this to support classification as well as regression.
      model.add(Dense(1, init='normal'))
      model.add(Dense(1, init='normal'))


      sgd = RMSprop(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
      sgd = RMSprop(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
  print "About to compile model"
      print("About to compile model")
      model.compile(loss=loss_function, optimizer=sgd)
      model.compile(loss=loss_function, optimizer=sgd)
  print "About to fit data to model."
      self.raw_model = model
  model.fit(X, y, batch_size=batch_size, nb_epoch=nb_epoch)

  return model
  def fit_on_batch(self, X, y, w):
    """Train the wrapped keras model on a single batch.

    Parameters
    ----------
    X: np.ndarray
      Batch of featurized samples in channels-last layout; reordered to
      channels-first before training.
    y: np.ndarray
      Batch labels.
    w: np.ndarray
      Batch weights (currently unused).
    """
    # TODO(rbharath): Modify the featurization so that it matches desired shaped.
    channels_first = shuffle_data(X)
    batch_loss = self.raw_model.train_on_batch(channels_first, y)
    print("Loss: %f" % batch_loss)

  def predict_on_batch(self, X):
    """Make predictions for one batch of data.

    Rejects anything that is not a rank-5 tensor of shape
    (n_samples, N, N, N, n_channels), reorders it to channels-first, and
    returns the squeezed predictions of the underlying keras model.
    """
    if len(np.shape(X)) != 5:
      raise ValueError(
          "Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
    predictions = self.raw_model.predict_on_batch(shuffle_data(X))
    return np.squeeze(predictions)
+30 −0
Original line number Original line Diff line number Diff line
"""
Factory function to construct models.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

from deep_chem.models.deep import SingleTaskDNN
from deep_chem.models.deep import MultiTaskDNN
from deep_chem.models.deep3d import DockingDNN
from deep_chem.models.standard import SklearnModel

def model_builder(model_type, task_types, model_params,
                  initialize_raw_model=True):
  """
  Factory function to construct model.

  Maps a model_type string to the matching Model subclass. Any
  unrecognized model_type falls through to SklearnModel (which is
  constructed without initialize_raw_model, as before).
  """
  if model_type == "singletask_deep_network":
    return SingleTaskDNN(task_types, model_params, initialize_raw_model)
  if model_type == "multitask_deep_network":
    return MultiTaskDNN(task_types, model_params, initialize_raw_model)
  if model_type == "convolutional_3D_regressor":
    return DockingDNN(task_types, model_params, initialize_raw_model)
  return SklearnModel(task_types, model_params)
+24 −0
Original line number Original line Diff line number Diff line
@@ -9,6 +9,30 @@ from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoLarsCV
from sklearn.linear_model import LassoLarsCV
from deep_chem.models import Model

class SklearnModel(Model):
  """
  Wrapper class for scikit-learn models.

  Placeholder implementation: fit_on_batch/predict_on_batch are not yet
  implemented for this class.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    # Delegate common attribute setup to the Model base class for
    # consistency with the other Model subclasses.
    super(SklearnModel, self).__init__(task_types, model_params,
                                       initialize_raw_model)
    # No underlying sklearn estimator yet; populated once fitting is
    # implemented (or via set_raw_model when loading from disk).
    self.raw_model = None

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")


def fit_singletask_models(train_data, modeltype):
def fit_singletask_models(train_data, modeltype):
  """Fits singletask linear regression models to potency.
  """Fits singletask linear regression models to potency.
Loading