Commit 8907dc23 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Merge pull request #44 from rbharath/predict-new

First draft of OO API
parents 415033d0 f0f841be
Loading
Loading
Loading
Loading

deep_chem/models/__init__.py

deleted 100644 → 0
+0 −101
Original line number Original line Diff line number Diff line
"""
Contains an abstract base class that supports different ML models.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

#TODO(enf/rbharath): incorporate save, load, eval, fit features into class Model.
class Model(object):
  """
  Abstract base class for different ML models.

  Subclasses implement fit_on_batch/predict_on_batch and store the
  underlying library-specific model object in self.raw_model.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    """
    Parameters
    ----------
    task_types: dict
      dict mapping task names to output type. Each output type must be
      either "classification" or "regression".
    model_params: dict
      Hyperparameters for the concrete model.
    initialize_raw_model: bool
      Unused in the base class; subclasses may consult it to skip
      building the underlying model (e.g. when loading from disk).
    """
    self.task_types = task_types
    self.model_params = model_params
    # Initialize raw_model so get_raw_model() cannot raise
    # AttributeError before set_raw_model() is called. All subclasses
    # in this package already set raw_model = None themselves.
    self.raw_model = None

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError(
        "Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError(
        "Each model is responsible for its own predict_on_batch method.")

  def set_raw_model(self, raw_model):
    """
    Set underlying raw model. Useful when loading from disk.
    """
    self.raw_model = raw_model

  def get_raw_model(self):
    """
    Return raw model.
    """
    return self.raw_model


# NOTE(review): dead code preserved as a module-level string literal.
# If revived, task_types.itervalues().next() is Python-2-only and must
# become next(iter(task_types.values())). Prefer deleting over keeping
# commented-out code under version control.
'''
def model_predictions(X, model, n_targets, task_types, modeltype="sklearn"):
  """Obtains predictions of provided model on test_set.

  Returns an ndarray of shape (n_samples, n_targets)

  TODO(rbharath): This function uses n_targets instead of
  task_transforms like everything else.

  Parameters
  ----------
  X: numpy.ndarray
    Test set data.
  model: model.
    A trained scikit-learn or keras model.
  n_targets: int
    Number of output targets
  task_types: dict
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
  modeltype: string
    Either sklearn, keras, or keras_multitask
  """
  # Extract features for test set and make preds
  # TODO(rbharath): This change in shape should not(!) be handled here. Make
  # an upstream change so the evaluator doesn't have to worry about this.
  if len(np.shape(X)) > 2:  # Dealing with 3D data
    if len(np.shape(X)) != 5:
      raise ValueError(
          "Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
    (n_samples, axis_length, _, _, n_channels) = np.shape(X)
    X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
  if modeltype == "keras-graph":
    predictions = model.predict({"input": X})
    ypreds = []
    for index in range(n_targets):
      ypreds.append(predictions["task%d" % index])
  elif modeltype == "sklearn":
    # Must be single-task (breaking multitask RFs here)
    task_type = task_types.itervalues().next()
    if task_type == "classification":
      print("model_predictions()")
      print("np.shape(X)")
      print(np.shape(X))
      ypreds = model.predict_proba(X)
    elif task_type == "regression":
      ypreds = model.predict(X)
  elif modeltype == "keras-sequential":
    ypreds = model.predict(X)
  else:
    raise ValueError("Improper modeltype.")
  if isinstance(ypreds, np.ndarray):
    ypreds = np.squeeze(ypreds)
  if not isinstance(ypreds, list):
    ypreds = [ypreds]
  return ypreds
'''

deep_chem/models/deep.py

deleted 100644 → 0
+0 −207
Original line number Original line Diff line number Diff line
"""
Code for processing the Google vs-datasets using keras.
"""
import numpy as np
from keras.models import Graph
from keras.layers.core import Dense, Dropout
from keras.optimizers import SGD
from deep_chem.models import Model

# TODO(rbharath/enf): Make this real. It's a dummy now.
class SingleTaskDNN(Model):
  """
  Placeholder single-task deep network.

  Construction only records configuration; the training and prediction
  hooks are not yet implemented.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    self.raw_model = None
    self.model_params = model_params
    self.task_types = task_types

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError("Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError("Each model is responsible for its own predict_on_batch method.")

# TODO(rbharath/enf): Make this real. It's a dummy now.
class MultiTaskDNN(Model):
  """
  Placeholder multitask deep network.

  Construction only records configuration; the training and prediction
  hooks are not yet implemented.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    self.raw_model = None
    self.model_params = model_params
    self.task_types = task_types

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError("Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError("Each model is responsible for its own predict_on_batch method.")

def to_one_hot(y):
  """Transforms label vector into one-hot encoding.

  Turns y into an array of shape [n_samples, 2] (assuming binary
  labels). Rows whose label is neither 0 nor 1 are left all-zero.

  y: np.ndarray
    A vector of shape [n_samples, 1]
  """
  n_samples = np.shape(y)[0]
  y_hot = np.zeros((n_samples, 2))
  # Walk output rows and labels in lockstep, setting the matching column.
  for row, label in zip(y_hot, y):
    if label == 1:
      row[1] = 1
    elif label == 0:
      row[0] = 1
  return y_hot

def fit_multitask_mlp(train_data, task_types, **training_params):
  """
  Perform stochastic gradient descent optimization for a keras multitask MLP.
  Returns AUCs, R^2 scores, and RMS values.

  Parameters
  ----------
  task_types: dict
    dict mapping task names to output type. Each output type must be either
    "classification" or "regression".
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model
  """
  # process_datasets groups the multitask labels/weights under key "all".
  X_train = train_data["features"]
  y_train, W_train = train_data["all"]
  trained = train_multitask_model(X_train, y_train, W_train, task_types,
                                  **training_params)
  return {"all": trained}

def fit_singletask_mlp(train_data, task_types, **training_params):
  """
  Perform stochastic gradient descent optimization for a keras MLP.

  Trains one single-task network per task, keeping only the compounds
  with nonzero weight for that task.

  Parameters
  ----------
  train_data: dict
    Holds "mol_ids", "features", "sorted_tasks", and a (y, W) pair
    keyed by each task name.
  task_types: dict
    dict mapping task names to output type. Each output type must be either
    "classification" or "regression".
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model

  Returns
  -------
  dict mapping task name to trained model.
  """
  models = {}
  train_ids = train_data["mol_ids"]
  X_train = train_data["features"]
  sorted_tasks = train_data["sorted_tasks"]
  for index, task in enumerate(sorted_tasks):
    # print() form runs under both Python 2 and 3; the package's other
    # modules already import print_function.
    print("Training model %d" % index)
    print("Target %s" % task)
    (y_train, W_train) = train_data[task]
    flat_W_train = W_train.ravel()
    # Mask out compounds with zero weight for this task.
    task_X_train = X_train[flat_W_train.nonzero()]
    task_y_train = y_train[flat_W_train.nonzero()]
    print("%d compounds in Train" % len(train_ids))
    models[task] = train_multitask_model(task_X_train, task_y_train, W_train,
                                         {task: task_types[task]},
                                         **training_params)
  return models

def train_multitask_model(X, y, W, task_types, learning_rate=0.01,
                          decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
                          dropout=0.5, nb_epoch=20, batch_size=50, nb_hidden=500,
                          validation_split=0.1):
  """
  Perform stochastic gradient descent optimization for a keras multitask MLP.
  Returns a trained model.

  Builds a keras Graph with a shared hidden layer plus dropout, and one
  output head per task ("task%d" outputs, sorted by task name).

  Parameters
  ----------
  X: np.ndarray
    Feature matrix
  y: np.ndarray
    Label matrix
  W: np.ndarray
    Weight matrix
  task_types: dict
    dict mapping task names to output type. Each output type must be either
    "classification" or "regression".
  learning_rate: float
    Learning rate used.
  decay: float
    Learning rate decay.
  momentum: float
    Momentum used in SGD.
  nesterov: bool
    Use Nesterov acceleration
  activation: string
    Activation for the shared hidden layer.
  dropout: float
    Dropout fraction applied after the shared hidden layer.
  nb_epoch: int
    maximal number of epochs to run the optimizer
  batch_size: int
    Minibatch size for SGD.
  nb_hidden: int
    Width of the shared hidden layer.
  validation_split: float
    Fraction of data held out by keras for validation.
  """
  eps = .001
  # Tasks are processed in sorted order so head indices are deterministic.
  sorted_tasks = sorted(task_types.keys())
  local_task_types = task_types.copy()
  endpoints = sorted_tasks
  # NOTE(review): Python-2-only print statements below; the package's
  # other modules import print_function. Left untouched here.
  print "train_multitask_model()"
  print "np.shape(X)"
  print np.shape(X)
  # Input width is the flattened length of a single sample.
  n_inputs = len(X[0].flatten())
  # Add eps weight to avoid minibatches with zero weight (causes theano to crash).
  W = W + eps * np.ones(np.shape(W))
  print "np.shape(W)"
  print np.shape(W)
  # Legacy keras Graph API: shared trunk feeding per-task heads.
  model = Graph()
  #model.add_input(name="input", ndim=n_inputs)
  model.add_input(name="input", input_shape=(n_inputs,))
  model.add_node(
      Dense(nb_hidden, init='uniform', activation=activation),
      name="dense", input="input")
  model.add_node(Dropout(dropout), name="dropout", input="dense")
  top_layer = "dropout"
  for ind, task in enumerate(endpoints):
    task_type = local_task_types[task]
    if task_type == "classification":
      # Two-way softmax head (binary labels one-hot encoded below).
      model.add_node(
          Dense(2, init='uniform', activation="softmax"),
          name="dense_head%d" % ind, input=top_layer)
    elif task_type == "regression":
      # Single linear output for regression.
      model.add_node(
          Dense(1, init='uniform'),
          name="dense_head%d" % ind, input=top_layer)
    model.add_output(name="task%d" % ind, input="dense_head%d" % ind)
  # Per-output training dicts keyed by the "task%d" output names.
  data_dict, loss_dict, sample_weights = {}, {}, {}
  data_dict["input"] = X
  for ind, task in enumerate(endpoints):
    task_type = local_task_types[task]
    taskname = "task%d" % ind
    sample_weights[taskname] = W[:, ind]
    if task_type == "classification":
      loss_dict[taskname] = "binary_crossentropy"
      data_dict[taskname] = to_one_hot(y[:, ind])
    elif task_type == "regression":
      loss_dict[taskname] = "mean_squared_error"
      data_dict[taskname] = y[:, ind]
  sgd = SGD(lr=learning_rate, decay=decay, momentum=momentum, nesterov=nesterov)
  print "About to compile model!"
  model.compile(optimizer=sgd, loss=loss_dict)
  print "Done compiling. About to fit model!"
  print "validation_split: " + str(validation_split)
  model.fit(data_dict, nb_epoch=nb_epoch, batch_size=batch_size,
            validation_split=validation_split, sample_weight=sample_weights)
  return model

deep_chem/models/model.py

deleted 100644 → 0
+0 −30
Original line number Original line Diff line number Diff line
"""
Factory function to construct models.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

from deep_chem.models.deep import SingleTaskDNN
from deep_chem.models.deep import MultiTaskDNN
from deep_chem.models.deep3d import DockingDNN
from deep_chem.models.standard import SklearnModel

def model_builder(model_type, task_types, model_params,
                  initialize_raw_model=True):
  """
  Factory function to construct model.

  Dispatches on model_type; any unrecognized type falls back to a
  scikit-learn model.
  """
  if model_type == "singletask_deep_network":
    return SingleTaskDNN(task_types, model_params, initialize_raw_model)
  if model_type == "multitask_deep_network":
    return MultiTaskDNN(task_types, model_params, initialize_raw_model)
  if model_type == "convolutional_3D_regressor":
    return DockingDNN(task_types, model_params, initialize_raw_model)
  # Default: sklearn wrapper (takes no initialize_raw_model flag).
  return SklearnModel(task_types, model_params)

deep_chem/models/standard.py

deleted 100644 → 0
+0 −86
Original line number Original line Diff line number Diff line
"""
Code for processing datasets using scikit-learn.
"""
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoLarsCV
from deep_chem.models import Model

class SklearnModel(Model):
  """
  Wrapper for scikit-learn models.

  Construction only records configuration; the batch training and
  prediction hooks are not yet implemented.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    self.raw_model = None
    self.model_params = model_params
    self.task_types = task_types

  def fit_on_batch(self, X, y, w):
    """
    Updates existing model with new information.
    """
    raise NotImplementedError("Each model is responsible for its own fit_on_batch method.")

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    raise NotImplementedError("Each model is responsible for its own predict_on_batch method.")

def fit_singletask_models(train_data, modeltype):
  """Fits singletask models to potency.

  Trains one scikit-learn model per task, keeping only the compounds
  with nonzero weight for that task.

  Parameters
  ----------
  train_data: dict
    Holds "features", "sorted_tasks", and a (y, W) pair keyed by each
    task name.
  modeltype: String
    One of "rf_regressor", "rf_classifier", "logistic", "linear",
    "ridge", "lasso", "lasso_lars", "elastic_net".

  Returns
  -------
  dict mapping task name to fitted model.

  Raises
  ------
  ValueError
    If modeltype is not one of the supported strings.
  """
  models = {}
  X_train = train_data["features"]
  sorted_tasks = train_data["sorted_tasks"]
  for task in sorted_tasks:
    # print() form runs under both Python 2 and 3.
    print("Building model for task %s" % task)
    (y_train, W_train) = train_data[task]
    W_train = W_train.ravel()
    # Mask out compounds with zero weight for this task.
    task_X_train = X_train[W_train.nonzero()]
    task_y_train = y_train[W_train.nonzero()]
    if modeltype == "rf_regressor":
      model = RandomForestRegressor(
          n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
    elif modeltype == "rf_classifier":
      model = RandomForestClassifier(
          n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
    elif modeltype == "logistic":
      model = LogisticRegression(class_weight="auto")
    elif modeltype == "linear":
      model = LinearRegression(normalize=True)
    elif modeltype == "ridge":
      model = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0], normalize=True)
    elif modeltype == "lasso":
      model = LassoCV(max_iter=2000, n_jobs=-1)
    elif modeltype == "lasso_lars":
      model = LassoLarsCV(max_iter=2000, n_jobs=-1)
    elif modeltype == "elastic_net":
      model = ElasticNetCV(max_iter=2000, n_jobs=-1)
    else:
      raise ValueError("Invalid model type provided.")
    model.fit(task_X_train, task_y_train.ravel())
    models[task] = model
  return models

deep_chem/scripts/process_bace.sh

deleted 100755 → 0
+0 −2
Original line number Original line Diff line number Diff line
# Usage ./process_bace.sh INPUT_SDF_FILE OUT_DIR DATASET_NAME
#   $1: input SDF file  $2: output directory  $3: dataset name
# Runs the modeler featurize step on a BACE SDF, extracting the Name,
# smiles, pIC50, and Model fields with pIC50 as the prediction endpoint.
# NOTE(review): positional args are unquoted; paths with spaces will
# break — consider "$1" "$2" "$3".
python -m deep_chem.scripts.modeler featurize --input-file $1 --input-type sdf --fields Name smiles pIC50 Model --field-types string string float string --name $3 --out $2 --prediction-endpoint pIC50
Loading