Commit f4a665f8 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

pylint fixes and python3 upgrades

parent adc366b5
Loading
Loading
Loading
Loading
+17 −19
Original line number Diff line number Diff line
@@ -2,10 +2,8 @@
Code for processing the Google vs-datasets using keras.
"""
import numpy as np
import keras
from keras.models import Graph
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.core import Dense, Dropout
from keras.optimizers import SGD
from deep_chem.utils.preprocess import to_one_hot

@@ -58,11 +56,12 @@ def fit_singletask_mlp(train_data, task_types, **training_params):
    task_y_train = y_train[flat_W_train.nonzero()]
    print "%d compounds in Train" % len(train_ids)
    models[task] = train_multitask_model(task_X_train, task_y_train, W_train,
        {task: task_types[task]}, **training_params)
                                         {task: task_types[task]},
                                         **training_params)
  return models

def train_multitask_model(X, y, W, task_types,
  learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
def train_multitask_model(X, y, W, task_types, learning_rate=0.01,
                          decay=1e-6, momentum=0.9, nesterov=True, activation="relu",
                          dropout=0.5, nb_epoch=20, batch_size=50, n_hidden=500,
                          validation_split=0.1):
  """
@@ -92,7 +91,6 @@ def train_multitask_model(X, y, W, task_types,
    maximal number of epochs to run the optimizer
  """
  eps = .001
  num_tasks = len(task_types)
  sorted_tasks = sorted(task_types.keys())
  local_task_types = task_types.copy()
  endpoints = sorted_tasks
+12 −15
Original line number Diff line number Diff line
@@ -7,7 +7,7 @@ from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D

def fit_3D_convolution(train_data, task_types, **training_params):
def fit_3D_convolution(train_data, **training_params):
  """
  Perform stochastic gradient descent for a 3D CNN.
  """
@@ -17,7 +17,6 @@ def fit_3D_convolution(train_data, task_types, **training_params):
    raise ValueError("3D Convolutions only supported for singletask.")
  task_name = train_data["sorted_tasks"][0]
  (y_train, _) = train_data["sorted_tasks"].itervalues().next()
  nb_classes = 2
  models[task_name] = train_3D_convolution(X_train, y_train, **training_params)
  return models

@@ -39,8 +38,6 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1,learning_rate=0.01,
  (n_samples, axis_length, _, _, n_channels) = np.shape(X)
  X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
  print "Final shape of X: " + str(np.shape(X))
  # Number of classes for classification
  nb_classes = 2

  # number of convolutional filters to use at each layer
  nb_filters = [axis_length/2, axis_length, axis_length]
@@ -53,8 +50,8 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1,learning_rate=0.01,

  model = Sequential()
  model.add(Convolution3D(nb_filter=nb_filters[0], stack_size=n_channels,
     nb_row=nb_conv[0], nb_col=nb_conv[0], nb_depth=nb_conv[0],
     border_mode='valid'))
                          nb_row=nb_conv[0], nb_col=nb_conv[0],
                          nb_depth=nb_conv[0], border_mode='valid'))
  model.add(Activation('relu'))
  model.add(MaxPooling3D(poolsize=(nb_pool[0], nb_pool[0], nb_pool[0])))
  model.add(Convolution3D(nb_filter=nb_filters[1], stack_size=nb_filters[0],
@@ -63,18 +60,18 @@ def train_3D_convolution(X, y, batch_size=50, nb_epoch=1,learning_rate=0.01,
  model.add(Activation('relu'))
  model.add(MaxPooling3D(poolsize=(nb_pool[1], nb_pool[1], nb_pool[1])))
  model.add(Convolution3D(nb_filter=nb_filters[2], stack_size=nb_filters[1],
     nb_row=nb_conv[2], nb_col=nb_conv[2], nb_depth=nb_conv[2],
     border_mode='valid'))
                          nb_row=nb_conv[2], nb_col=nb_conv[2],
                          nb_depth=nb_conv[2], border_mode='valid'))
  model.add(Activation('relu'))
  model.add(MaxPooling3D(poolsize=(nb_pool[2], nb_pool[2], nb_pool[2])))
  model.add(Flatten())
  # TODO(rbharath): If we change away from axis-size 32, this code will break.
  # Eventually figure out a more general rule that works for all axis sizes.
  model.add(Dense(32, 32/2, init='normal'))
  model.add(Dense(32/2, init='normal'))
  model.add(Activation('relu'))
  model.add(Dropout(0.5))
  # TODO(rbharath): Generalize this to support classification as well as regression.
  model.add(Dense(32/2, 1, init='normal'))
  model.add(Dense(1, init='normal'))

  sgd = RMSprop(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
  print "About to compile model"
+0 −11
Original line number Diff line number Diff line
@@ -69,14 +69,3 @@ def fit_singletask_models(train_data, modeltype):
    model.fit(task_X_train, task_y_train.ravel())
    models[task] = model
  return models

## TODO(rbharath): I believe this is broken. Update it to work with the rest of
## the package.
#def fit_multitask_rf(train_data):
#  """Fits a multitask RF model to provided dataset.
#  """
#  (_, X_train, y_train, _) = train_data
#  model = RandomForestClassifier(
#      n_estimators=100, n_jobs=-1, class_weight="auto")
#  model.fit(X_train, y_train)
#  return model
+35 −51
Original line number Diff line number Diff line
"""
Top level script to featurize input, train models, and evaluate them.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals
import argparse
import gzip
import cPickle as pickle
import joblib
import os
from deep_chem.utils.featurize import generate_directories
from deep_chem.utils.featurize import extract_data
@@ -12,7 +12,6 @@ from deep_chem.utils.featurize import generate_targets
from deep_chem.utils.featurize import generate_features
from deep_chem.utils.featurize import generate_vs_utils_features
from deep_chem.models.standard import fit_singletask_models
#from deep_chem.utils.load import get_target_names
from deep_chem.utils.load import process_datasets
from deep_chem.utils.load import transform_data
from deep_chem.utils.evaluate import results_to_csv
@@ -171,11 +170,6 @@ def add_fit_command(subparsers):
  group.add_argument(
      "--saved-data", required=1,
      help="Location of saved transformed data.")
  # TODO(rbharath): CODE SMELL. This shouldn't be shuttled around
  group.add_argument(
      "--paths", nargs="+", required=1,
      help="Paths to input datasets.")

  add_model_group(fit_cmd)
  group = fit_cmd.add_argument_group("save")
  group.add_argument(
@@ -195,10 +189,6 @@ def add_eval_command(subparsers):
      help="Location from which to load saved model.")
  group.add_argument(
      "--saved-data", required=1, help="Location of saved transformed data.")
  # TODO(rbharath): CODE SMELL. This shouldn't be shuttled around
  group.add_argument(
      "--paths", nargs="+", required=1,
      help="Paths to input datasets.")
  group.add_argument(
      "--modeltype", required=1,
      choices=["sklearn", "keras-graph", "keras-sequential"],
@@ -291,8 +281,8 @@ def add_model_command(subparsers):
def create_model(args):
  """Creates a model"""
  data_dir = os.path.join(args.out, args.name)
  print "+++++++++++++++++++++++++++++++++"
  print "Perform featurization"
  print("+++++++++++++++++++++++++++++++++")
  print("Perform featurization")
  if not args.skip_featurization:
    _featurize_input(
        args.name, args.out, args.input_file, args.input_type, args.fields,
@@ -300,8 +290,8 @@ def create_model(args):
        args.smiles_field, args.split_field, args.id_field, args.threshold,
        args.delimiter)

  print "+++++++++++++++++++++++++++++++++"
  print "Perform train-test split"
  print("+++++++++++++++++++++++++++++++++")
  print("Perform train-test split")
  paths = [data_dir]
  weight_positives = False  # Hard coding this for now
  train_out = os.path.join(data_dir, "%s-train.joblib" % args.name)
@@ -312,21 +302,21 @@ def create_model(args):
        args.splittype, weight_positives, args.mode, train_out, test_out,
        args.target_fields)

  print "+++++++++++++++++++++++++++++++++"
  print "Fit model"
  print("+++++++++++++++++++++++++++++++++")
  print("Fit model")
  modeltype = get_model_type(args.model)
  extension = get_model_extension(modeltype)
  saved_out = os.path.join(data_dir, "%s.%s" % (args.model, extension))
  if not args.skip_fit:
    _fit_model(
        paths, args.model, args.task_type, args.n_hidden, args.learning_rate,
        args.model, args.task_type, args.n_hidden, args.learning_rate,
        args.dropout, args.n_epochs, args.decay, args.batch_size, args.loss_function,
        args.validation_split, saved_out, train_out, args.target_fields)


  print "+++++++++++++++++++++++++++++++++"
  print "Eval Model on Train"
  print "-------------------"
  print("+++++++++++++++++++++++++++++++++")
  print("Eval Model on Train")
  print("-------------------")
  csv_out_train = os.path.join(data_dir, "%s-train.csv" % args.name)
  stats_out_train = os.path.join(data_dir, "%s-train-stats.txt" % args.name)
  csv_out_test = os.path.join(data_dir, "%s-test.csv" % args.name)
@@ -334,24 +324,22 @@ def create_model(args):
  compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef = (
      False, False, False, False)
  compute_r2s, compute_rms = False, False
  print "create_model()"
  print "args.task_type"
  print args.task_type
  print("create_model()")
  print("args.task_type")
  print(args.task_type)
  if args.task_type == "classification":
    compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef = (
        True, True, True, True)
  elif args.task_type == "regression":
    compute_r2s, compute_rms = True, True
  _eval_trained_model(
      modeltype, saved_out, train_out, paths, args.task_type, compute_aucs,
      modeltype, saved_out, train_out, args.task_type, compute_aucs,
      compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s,
      compute_rms, csv_out_train, stats_out_train, args.target_fields)
  print "(compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s, compute_rms)"
  print (compute_aucs, compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s, compute_rms)
  print "Eval Model on Test"
  print "------------------"
  print("Eval Model on Test")
  print("------------------")
  _eval_trained_model(
      modeltype, saved_out, test_out, paths, args.task_type, compute_aucs,
      modeltype, saved_out, test_out, args.task_type, compute_aucs,
      compute_recall, compute_accuracy, compute_matthews_corrcoef, compute_r2s,
      compute_rms, csv_out_test, stats_out_test, args.target_fields)

@@ -386,17 +374,17 @@ def _featurize_input(name, out, input_file, input_type, fields, field_types,
  if id_field is None:
    id_field = smiles_field
  out_x_pkl, out_y_pkl = generate_directories(name, out, feature_fields)
  df, mols = extract_data(
  df, _ = extract_data(
      input_file, input_type, fields, field_types, target_fields,
      smiles_field, threshold, delimiter)
  print "Generating targets"
  print("Generating targets")
  generate_targets(df, target_fields, split_field,
                   smiles_field, id_field, out_y_pkl)
  print "Generating user-specified features"
  print("Generating user-specified features")
  generate_features(df, feature_fields, smiles_field, id_field, out_x_pkl)
  print "Generating circular fingerprints"
  print("Generating circular fingerprints")
  generate_vs_utils_features(df, name, out, smiles_field, id_field, "fingerprints")
  print "Generating rdkit descriptors"
  print("Generating rdkit descriptors")
  generate_vs_utils_features(df, name, out, smiles_field, id_field, "descriptors")

def train_test_input(args):
@@ -418,10 +406,8 @@ def _train_test_input(paths, output_transforms, input_transforms,
  feature_types = feature_types.split(",")
  print("About to process_dataset")
  train_dict, test_dict = process_datasets(
      paths, input_transforms, output_transforms_dict,
      feature_types=feature_types, splittype=splittype,
      weight_positives=weight_positives, mode=mode,
      target_names=target_names)
      paths, feature_types=feature_types, splittype=splittype,
      mode=mode, target_names=target_names)
  print("Finished process_dataset")

  print("Starting transform_data")
@@ -446,16 +432,15 @@ def fit_model(args):
  """Wrapper that calls _fit_model with arguments unwrapped."""
  # TODO(rbharath): Bundle these arguments up into a training_params dict.
  _fit_model(
      args.paths, args.model, args.task_type, args.n_hidden,
      args.model, args.task_type, args.n_hidden,
      args.learning_rate, args.dropout, args.n_epochs, args.decay,
      args.batch_size, args.loss_function, args.validation_split,
      args.saved_out, args.saved_data, args.target_fields)

def _fit_model(paths, model, task_type, n_hidden, learning_rate, dropout,
def _fit_model(model, task_type, n_hidden, learning_rate, dropout,
               n_epochs, decay, batch_size, loss_function, validation_split, saved_out,
               saved_data, target_names):
  """Builds model from featurized data."""
  #targets = get_target_names(paths)
  task_types = {target: task_type for target in target_names}

  stored_train = load_sharded_dataset(saved_data)
@@ -476,7 +461,7 @@ def _fit_model(paths, model, task_type, n_hidden, learning_rate, dropout,
  elif model == "3D_cnn":
    from deep_chem.models.deep3d import fit_3D_convolution
    models = fit_3D_convolution(
        train_dict, task_types, nb_epoch=n_epochs, batch_size=batch_size,
        train_dict, nb_epoch=n_epochs, batch_size=batch_size,
        learning_rate=learning_rate, loss_function=loss_function)
  else:
    models = fit_singletask_models(train_dict, model)
@@ -507,19 +492,18 @@ def get_model_extension(modeltype):
def eval_trained_model(args):
  """Wrapper function that calls _eval_trained_model with unwrapped args."""
  _eval_trained_model(
      args.modeltype, args.saved_model, args.saved_data, args.paths,
      args.modeltype, args.saved_model, args.saved_data,
      args.task_type, args.compute_aucs, args.compute_recall,
      args.compute_accuracy, args.compute_matthews_corrcoef, args.compute_r2s,
      args.compute_rms, args.csv_out, args.stats_out,
      args.target_fields)

def _eval_trained_model(modeltype, saved_model, saved_data, paths, task_type,
def _eval_trained_model(modeltype, saved_model, saved_data, task_type,
                        compute_aucs, compute_recall, compute_accuracy,
                        compute_matthews_corrcoef, compute_r2s, compute_rms,
                        csv_out, stats_out, target_names):
  """Evaluates a trained model on specified data."""
  model = load_model(modeltype, saved_model)
  #targets = get_target_names(paths)
  task_types = {target: task_type for target in target_names}

  stored_test = load_sharded_dataset(saved_data)
@@ -534,7 +518,7 @@ def _eval_trained_model(modeltype, saved_model, saved_data, paths, task_type,
        recall=compute_recall, accuracy=compute_accuracy,
        mcc=compute_matthews_corrcoef, print_file=stats_file)
  with open(stats_out, "r") as stats_file:
    print stats_file.read()
    print(stats_file.read())
  results_to_csv(results, csv_out, task_type=task_type)

def main():

deep_chem/utils/analysis.py

deleted 100644 → 0
+0 −128
Original line number Diff line number Diff line
"""
Utility functions to compare datasets to one another.
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2015, Stanford University"
__license__ = "LGPL"

import numpy as np

def results_to_csv(results_dict):
  """Pretty prints results as a two-line CSV (header row, then values).

  Parameters
  ----------
  results_dict: dict
    Maps target names to result values. Values are converted with str().
  """
  targets = sorted(results_dict.keys())
  # Header row of target names, then one row of values in the same order.
  print(",".join(targets))
  print(",".join([str(results_dict[target]) for target in targets]))

def summarize_distribution(y):
  """Analyzes regression dataset.

  Prints summary statistics (mean, std, min, max, histogram) of the
  provided distribution. Returns nothing; output goes to stdout.

  Parameters
  ----------
  y: np.ndarray
    A 1D numpy array containing distribution.
  """
  mean = np.mean(y)
  std = np.std(y)
  minval = np.amin(y)
  maxval = np.amax(y)
  # np.histogram returns (counts, bin_edges) with 10 bins by default.
  hist = np.histogram(y)
  print("Mean: %f" % mean)
  print("Std: %f" % std)
  print("Min: %f" % minval)
  print("Max: %f" % maxval)
  print("Histogram: ")
  print(hist)

#def analyze_data(dataset, splittype="random"):
#  """Analyzes regression dataset.
#
#  Parameters
#  ----------
#  dataset: dict
#    A dictionary of type produced by load_datasets.
#  splittype: string
#    Type of split for train/test. Either random or scaffold.
#  """
#  singletask = multitask_to_singletask(dataset)
#  for target in singletask:
#    data = singletask[target]
#    if len(data.keys()) == 0:
#      continue
#    if splittype == "random":
#      train, test = train_test_random_split(data, seed=0)
#    elif splittype == "scaffold":
#      train, test = train_test_scaffold_split(data)
#    else:
#      raise ValueError("Improper splittype. Must be random/scaffold.")
#    _, Xtrain, ytrain, _ = dataset_to_numpy(train)
#    # TODO(rbharath): Take this out once debugging is completed
#    ytrain = np.log(ytrain)
#    mean = np.mean(ytrain)
#    std = np.std(ytrain)
#    minval = np.amin(ytrain)
#    maxval = np.amax(ytrain)
#    hist = np.histogram(ytrain)
#    print target
#    print "Mean: %f" % mean
#    print "Std: %f" % std
#    print "Min: %f" % minval
#    print "Max: %f" % maxval
#    print "Histogram: "
#    print hist


def compare_all_datasets():
  """Compare all datasets in our collection.

  Pairwise compares the MUV, PCBA, DUDE, and Pfizer datasets by molecule
  and scaffold overlap, printing results to stdout. Dataset locations are
  hard-coded to one machine's filesystem.

  TODO(rbharath): Make this actually robust.
  """
  # NOTE(review): load_datasets is not imported in this module (only numpy
  # is) — confirm its intended source and add the import.
  muv_path = "/home/rbharath/vs-datasets/muv"
  pcba_path = "/home/rbharath/vs-datasets/pcba"
  dude_path = "/home/rbharath/vs-datasets/dude"
  pfizer_path = "/home/rbharath/private-datasets/pfizer"
  muv_data = load_datasets([muv_path])
  pcba_data = load_datasets([pcba_path])
  dude_data = load_datasets([dude_path])
  pfizer_data = load_datasets([pfizer_path])
  print("----------------------")
  compare_datasets("muv", muv_data, "pcba", pcba_data)
  print("----------------------")
  compare_datasets("pfizer", pfizer_data, "muv", muv_data)
  print("----------------------")
  compare_datasets("pfizer", pfizer_data, "pcba", pcba_data)
  print("----------------------")
  compare_datasets("muv", muv_data, "dude", dude_data)
  print("----------------------")
  compare_datasets("pcba", pcba_data, "dude", dude_data)
  print("----------------------")
  compare_datasets("pfizer", pfizer_data, "dude", dude_data)

def compare_datasets(first_name, first, second_name, second):
  """Counts the overlap between two provided datasets.

  Prints molecule and scaffold counts for each dataset, and the number of
  scaffolds common to both. Returns nothing; output goes to stdout.

  Parameters
  ----------
  first_name: string
    Name of first dataset
  first: dict
    Data dictionary generated by load_datasets. Each value is unpacked as a
    3-tuple whose second entry is the scaffold.
  second_name: string
    Name of second dataset
  second: dict
    Data dictionary generated by load_datasets.
  """
  first_scaffolds = set()
  for key in first:
    _, scaffold, _ = first[key]
    first_scaffolds.add(scaffold)
  print("%d molecules in %s dataset" % (len(first), first_name))
  print("%d scaffolds in %s dataset" % (len(first_scaffolds), first_name))
  second_scaffolds = set()
  for key in second:
    _, scaffold, _ = second[key]
    second_scaffolds.add(scaffold)
  print("%d molecules in %s dataset" % (len(second), second_name))
  print("%d scaffolds in %s dataset" % (len(second_scaffolds), second_name))
  common_scaffolds = first_scaffolds.intersection(second_scaffolds)
  print("%d scaffolds in both" % len(common_scaffolds))
Loading