Commit 415033d0 authored by Bharath Ramsundar

Merge pull request #35 from evanfeinberg/master

Working with Data Sets exceeding RAM 
parents 6d4f6df0 34925328
+101 −0
"""
Contains an abstract base class that supports different ML models.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

#TODO(enf/rbharath): incorporate save, load, eval, fit features into class Model.
class Model(object):
  """
  Abstract base class for different ML models.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    self.task_types = task_types
    self.model_params = model_params

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")    

  def set_raw_model(self, raw_model):
    """
    Set underlying raw model. Useful when loading from disk.
    """
    self.raw_model = raw_model

  def get_raw_model(self):
    """
    Return raw model.
    """
    return self.raw_model

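# --- Illustrative sketch (editor's addition, not part of this commit) ---
# A minimal concrete subclass showing how the batch interface above is meant to
# be used: fit_on_batch() is called repeatedly with chunks that individually
# fit in memory, and predict_on_batch() scores one chunk at a time. The
# least-squares-by-SGD "model" below is hypothetical and exists only to make
# the example self-contained.
import numpy as np

class LinearBatchModel(Model):
  """Toy linear regressor updated with one gradient step per batch."""
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    super(LinearBatchModel, self).__init__(task_types, model_params)
    self.coef = None
    self.learning_rate = model_params.get("learning_rate", 0.01)

  def fit_on_batch(self, X, y, w):
    if self.coef is None:
      self.coef = np.zeros(X.shape[1])
    residual = X.dot(self.coef) - np.ravel(y)
    # w holds per-example weights; use them to scale the squared-error gradient.
    grad = X.T.dot(np.ravel(w) * residual) / len(residual)
    self.coef -= self.learning_rate * grad

  def predict_on_batch(self, X):
    return X.dot(self.coef)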

'''
def model_predictions(X, model, n_targets, task_types, modeltype="sklearn"):
  """Obtains predictions of provided model on test_set.

  Returns an ndarray of shape (n_samples, n_targets)

  TODO(rbharath): This function uses n_targets instead of
  task_transforms like everything else.

  Parameters
  ----------
  X: numpy.ndarray
    Test set data.
  model: model.
    A trained scikit-learn or keras model.
  n_targets: int
    Number of output targets
  task_types: dict
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
  modeltype: string
    Either sklearn, keras, or keras_multitask
  """
  # Extract features for test set and make preds
  # TODO(rbharath): This change in shape should not(!) be handled here. Make
  # an upstream change so the evaluator doesn't have to worry about this.
  if len(np.shape(X)) > 2:  # Dealing with 3D data
    if len(np.shape(X)) != 5:
      raise ValueError(
          "Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
    (n_samples, axis_length, _, _, n_channels) = np.shape(X)
    X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
  if modeltype == "keras-graph":
    predictions = model.predict({"input": X})
    ypreds = []
    for index in range(n_targets):
      ypreds.append(predictions["task%d" % index])
  elif modeltype == "sklearn":
    # Must be single-task (breaking multitask RFs here)
    task_type = task_types.itervalues().next()
    if task_type == "classification":
      print("model_predictions()")
      print("np.shape(X)")
      print(np.shape(X))
      ypreds = model.predict_proba(X)
    elif task_type == "regression":
      ypreds = model.predict(X)
  elif modeltype == "keras-sequential":
    ypreds = model.predict(X)
  else:
    raise ValueError("Improper modeltype.")
  if isinstance(ypreds, np.ndarray):
    ypreds = np.squeeze(ypreds)
  if not isinstance(ypreds, list):
    ypreds = [ypreds]
  return ypreds
'''
+67 −3
@@ -5,8 +5,72 @@ import numpy as np
from keras.models import Graph
from keras.layers.core import Dense, Dropout
from keras.optimizers import SGD
from deep_chem.utils.preprocess import to_one_hot
from deep_chem.models import Model

#TODO(rbharath/enf): Make this real. It's a dummy now.
class SingleTaskDNN(Model):
  """
  Single-task deep network model. Currently a stub.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    self.task_types = task_types
    self.model_params = model_params
    self.raw_model = None

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")    

#TODO(rbharath/enf): Make this real. It's a dummy now.
class MultiTaskDNN(Model):
  """
  Multitask deep network model. Currently a stub.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    self.task_types = task_types
    self.model_params = model_params
    self.raw_model = None

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")   

def to_one_hot(y):
  """Transforms label vector into one-hot encoding.

  Turns y into an array of shape [n_samples, 2] (assuming binary labels).

  y: np.ndarray
    A vector of shape [n_samples, 1]
  """
  n_samples = np.shape(y)[0]
  y_hot = np.zeros((n_samples, 2))
  for index, val in enumerate(y):
    if val == 0:
      y_hot[index] = np.array([1, 0])
    elif val == 1:
      y_hot[index] = np.array([0, 1])
  return y_hot

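# Illustrative usage (editor's addition, not part of this commit):
#   >>> to_one_hot(np.array([0, 1, 1]))
#   array([[ 1.,  0.],
#          [ 0.,  1.],
#          [ 0.,  1.]])
# A vectorized equivalent, assuming y contains only 0/1 integer labels, is
#   np.eye(2)[np.asarray(y, dtype=int).ravel()]
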
def fit_multitask_mlp(train_data, task_types, **training_params):
  """
@@ -62,7 +126,7 @@ def fit_singletask_mlp(train_data, task_types, **training_params):

def train_multitask_model(X, y, W, task_types, learning_rate=0.01,
                          decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
                          dropout=0.5, nb_epoch=20, batch_size=50, nb_hidden=500,
                          validation_split=0.1):
  """
  Perform stochastic gradient descent optimization for a keras multitask MLP.
@@ -106,7 +170,7 @@ def train_multitask_model(X, y, W, task_types, learning_rate=0.01,
  #model.add_input(name="input", ndim=n_inputs)
  model.add_input(name="input", input_shape=(n_inputs,))
  model.add_node(
      Dense(nb_hidden, init='uniform', activation=activation),
      name="dense", input="input")
  model.add_node(Dropout(dropout), name="dropout", input="dense")
  top_layer = "dropout"
+72 −62
"""
Code for training 3D convolutions.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import numpy as np
from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D
from deep_chem.models import Model

def shuffle_shape(shape):
  (axis_length, _, _, n_channels) = shape
  shuffled_shape = (n_channels, axis_length, axis_length, axis_length)
  return shuffled_shape

def shuffle_data(X):
  (n_samples, axis_length, _, _, n_channels) = np.shape(X)
  X = np.reshape(X, (n_samples, n_channels, axis_length, axis_length, axis_length))
  return X
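
# Editor's note (not part of this commit): np.reshape only reinterprets the
# underlying buffer; it changes the reported shape but does not move the
# channel axis relative to the voxel data. If the intent is to permute axes
# from (n_samples, N, N, N, n_channels) to (n_samples, n_channels, N, N, N)
# while keeping each voxel's channels aligned, np.transpose would do that:
#   X = np.transpose(X, (0, 4, 1, 2, 3))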

class DockingDNN(Model):
  """
  Wrapper class for fitting 3D convolutional networks for deep docking.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    super(DockingDNN, self).__init__(task_types, model_params, initialize_raw_model)
    if initialize_raw_model:
      (axis_length, _, _, n_channels) = model_params["data_shape"]
      self.input_shape = (n_channels, 
                          axis_length, axis_length, axis_length)

      learning_rate = model_params["learning_rate"]
      loss_function = model_params["loss_function"]

      # number of convolutional filters to use at each layer
      # (integer division so nb_filter stays an int under future division)
      nb_filters = [axis_length // 2, axis_length, axis_length]
@@ -47,35 +46,46 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1, learning_rate=0.01,

      # level of convolution to perform at each layer (CONV x CONV)
      nb_conv = [7, 5, 3]

      model = Sequential()

      model.add(Convolution3D(nb_filter=nb_filters[0], nb_depth=nb_conv[0],
                              nb_row=nb_conv[0], nb_col=nb_conv[0],
                              input_shape=self.input_shape, border_mode="full"))
      model.add(Activation('relu'))

      model.add(MaxPooling3D(pool_size=(nb_pool[0], nb_pool[0], nb_pool[0])))
      model.add(Convolution3D(nb_filter=nb_filters[1], nb_depth=nb_conv[1],
                              nb_row=nb_conv[1], nb_col=nb_conv[1], border_mode="full"))
      model.add(Activation('relu'))
      model.add(MaxPooling3D(pool_size=(nb_pool[1], nb_pool[1], nb_pool[1])))
      model.add(Convolution3D(nb_filter=nb_filters[2], nb_depth=nb_conv[2],
                              nb_row=nb_conv[2], nb_col=nb_conv[2], border_mode="full"))
      model.add(Activation('relu'))
      model.add(MaxPooling3D(pool_size=(nb_pool[2], nb_pool[2], nb_pool[2])))
      model.add(Flatten())
      # TODO(rbharath): If we change away from axis-size 32, this code will break.
      # Eventually figure out a more general rule that works for all axis sizes.
      model.add(Dense(16, init='normal'))
      model.add(Activation('relu'))
      model.add(Dropout(0.5))
      # TODO(rbharath): Generalize this to support classification as well as regression.
      model.add(Dense(1, init='normal'))

      sgd = RMSprop(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
      print("About to compile model")
      model.compile(loss=loss_function, optimizer=sgd)
      self.raw_model = model

  def fit_on_batch(self, X, y, w):
    # TODO(rbharath): Modify the featurization so that it matches desired shaped.
    X = shuffle_data(X)
    loss = self.raw_model.train_on_batch(X, y)
    print("Loss: %f" % loss)

  def predict_on_batch(self, X):
    if len(np.shape(X)) != 5:
      raise ValueError(
          "Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
    X = shuffle_data(X)
    y_pred = self.raw_model.predict_on_batch(X)
    y_pred = np.squeeze(y_pred)
    return y_pred
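
# Illustrative sketch (editor's addition, not part of this commit): the
# per-batch interface is what lets training work on datasets that exceed RAM.
# `iter_shards` is a hypothetical loader that yields (X, y, w) chunks from
# disk one at a time; only one chunk is ever held in memory.
#
#   model = DockingDNN(task_types, model_params)
#   for epoch in range(nb_epoch):
#     for X_shard, y_shard, w_shard in iter_shards(shard_dir):
#       model.fit_on_batch(X_shard, y_shard, w_shard)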
+30 −0
"""
Factory function to construct models.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

from deep_chem.models.deep import SingleTaskDNN
from deep_chem.models.deep import MultiTaskDNN
from deep_chem.models.deep3d import DockingDNN
from deep_chem.models.standard import SklearnModel

def model_builder(model_type, task_types, model_params,
                  initialize_raw_model=True):
  """
  Factory function to construct model.
  """
  if model_type == "singletask_deep_network":
    model = SingleTaskDNN(task_types, model_params,
                          initialize_raw_model)
  elif model_type == "multitask_deep_network":
    model = MultiTaskDNN(task_types, model_params,
                         initialize_raw_model)
  elif model_type == "convolutional_3D_regressor":
    model = DockingDNN(task_types, model_params,
                       initialize_raw_model)
  else:
    model = SklearnModel(task_types, model_params)
  return model
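
# Illustrative usage (editor's addition, not part of this commit); the
# parameter values shown are hypothetical but mirror what DockingDNN reads
# from model_params above.
#
#   model = model_builder("convolutional_3D_regressor",
#                         task_types={"binding_energy": "regression"},
#                         model_params={"data_shape": (32, 32, 32, 4),
#                                       "learning_rate": 0.01,
#                                       "loss_function": "mean_squared_error"})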
+24 −0
@@ -9,6 +9,30 @@ from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoLarsCV
from deep_chem.models import Model

class SklearnModel(Model):
  """
  Wrapper for scikit-learn models. Currently a stub.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    self.task_types = task_types
    self.model_params = model_params
    self.raw_model = None

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")   

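# Illustrative sketch (editor's addition, not part of this commit): one way the
# stub above could be filled in for scikit-learn estimators that support
# incremental learning via partial_fit (e.g. SGDRegressor), so that each call
# only needs the current batch in memory. The class name is hypothetical.
import numpy as np
from sklearn.linear_model import SGDRegressor

class IncrementalSklearnModel(Model):
  """Wraps an sklearn estimator that exposes partial_fit."""
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    self.task_types = task_types
    self.model_params = model_params
    self.raw_model = SGDRegressor() if initialize_raw_model else None

  def fit_on_batch(self, X, y, w):
    self.raw_model.partial_fit(X, np.ravel(y), sample_weight=np.ravel(w))

  def predict_on_batch(self, X):
    return self.raw_model.predict(X)
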
def fit_singletask_models(train_data, modeltype):
  """Fits singletask linear regression models to potency.