Commit 50e9071d authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Next batch of changes required for 3D CNNs

parent 697edab1
Loading
Loading
Loading
Loading
+2 −3
Original line number Diff line number Diff line
@@ -97,8 +97,7 @@ def process_singletask(paths, task_transforms, splittype="random", seed=None,
      raise ValueError("Improper splittype. Must be random/scaffold.")
    X_train, y_train, W_train = dataset_to_numpy(train)
    X_test, y_test, W_test = dataset_to_numpy(test)
    arrays[target] = (train, X_train, y_train, W_train, test, X_test, y_test,
        W_test)
    arrays[target] = (train, X_train, y_train, W_train), (test, X_test, y_test, W_test)
  return arrays


@@ -121,7 +120,7 @@ def fit_multitask_mlp(paths, task_types, task_transforms,
  training_params: dict
    Aggregates keyword parameters to pass to train_multitask_model
  """
  (train, X_train, y_train, W_train, test, X_test, y_test, W_test) = (
  (train, X_train, y_train, W_train), (test, X_test, y_test, W_test) = (
      process_multitask(paths, task_transforms, splittype=splittype,
      weight_positives=weight_positives))
  print np.shape(y_train)
+21 −16
Original line number Diff line number Diff line
"""
Code for training 3D convolutions.
"""
from deep_chem.datasets.shapes_3d import load_data
import numpy as np
from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D
from keras.utils import np_utils
import numpy as np
from deep_chem.utils.preprocess import train_test_random_split
from deep_chem.utils.load import load_and_transform_dataset
from deep_chem.utils.preprocess import tensor_dataset_to_numpy
from deep_chem.datasets.shapes_3d import load_data

# TODO(rbharath): Factor this out into a separate function in utils. Duplicates
# code in deep.py
def process_3D_convolutions(paths, task_transforms, splittype="random"):
def process_3D_convolutions(paths, task_transforms, seed=None, splittype="random"):
  """Loads 3D Convolution datasets.

  Parameters
@@ -19,24 +22,26 @@ def process_3D_convolutions(paths, task_transforms, splittype="random"):
  paths: list
    List of paths to convolution datasets.
  """
  dataset = load_and_transform_dataset(paths, task_transforms)
  dataset = load_and_transform_dataset(paths, task_transforms, datatype="pdbbind")
  # TODO(rbharath): Factor this code splitting out into a util function.
  if splittype == "random":
    train, test = train_test_random_split(dataset, seed=seed)
  elif splittype == "scaffold":
    train, test = train_test_scaffold_split(dataset)
  X_train, y_train, W_train = dataset_to_numpy(train)
  X_test, y_test, W_test = dataset_to_numpy(test)
  X_train, y_train, W_train = tensor_dataset_to_numpy(train)
  X_test, y_test, W_test = tensor_dataset_to_numpy(test)
  return (X_train, y_train, W_train), (X_test, y_test, W_test)

def fit_3D_convolution(axis_length=32, **training_params):
def fit_3D_convolution(paths, task_types, task_transforms, axis_length=32, **training_params):
  """
  Perform stochastic gradient descent for a 3D CNN.
  """
  # TODO(rbharath): task_types is not yet used below.
  (X_train, y_train, W_train), (X_test, y_test, W_test) = process_3D_convolutions(
    paths, task_transforms)
  nb_classes = 2
  (X_train, y_train), (X_test, y_test) = load_data(axis_length=axis_length)
  y_train = np_utils.to_categorical(y_train, nb_classes)
  y_test = np_utils.to_categorical(y_test, nb_classes)
  print "np.shape(y_train)"
  print np.shape(y_train)
  print "np.shape(X_train): " + str(np.shape(X_train))
  print "np.shape(y_train): " + str(np.shape(y_train))
  train_3D_convolution(X_train, y_train, axis_length, **training_params)
@@ -66,9 +71,10 @@ def train_3D_convolution(X, y, axis_length=32, batch_size=50, nb_epoch=1):
  nb_conv = [7, 5, 3]

  model = Sequential()
  model.add(Convolution3D(nb_filter=nb_filters[0], stack_size=1,
  # TODO(rbharath): Avoid hard coding the number of stacks here
  model.add(Convolution3D(nb_filter=nb_filters[0], stack_size=3,
     nb_row=nb_conv[0], nb_col=nb_conv[0], nb_depth=nb_conv[0],
     border_mode='valid'))
     border_mode='valid', input_shape=(32, 32, 32, 3)))
  model.add(Activation('relu'))
  model.add(MaxPooling3D(poolsize=(nb_pool[0], nb_pool[0], nb_pool[0])))
  model.add(Convolution3D(nb_filter=nb_filters[1], stack_size=nb_filters[0],
@@ -85,11 +91,10 @@ def train_3D_convolution(X, y, axis_length=32, batch_size=50, nb_epoch=1):
  model.add(Dense(320, 32/2, init='normal'))
  model.add(Activation('relu'))
  model.add(Dropout(0.5))
  model.add(Dense(32/2, nb_classes, init='normal'))
  model.add(Activation('softmax'))
  # TODO(rbharath): Generalize this to support classification as well as regression.
  model.add(Dense(32/2, 1, init='normal'))

  sgd = RMSprop(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
  model.compile(loss='categorical_crossentropy', optimizer=sgd)
  model.compile(loss='mean_squared_error', optimizer=sgd)
  model.fit(X, y, batch_size=batch_size, nb_epoch=nb_epoch)

  return model
+2 −1
Original line number Diff line number Diff line
@@ -146,7 +146,8 @@ def generate_targets(input_file, input_type, columns, column_types, out_pkl,
    # TODO(rbharath): This patch is only in place until the smiles/sequence
    # support is fixed.
    if row["smiles"] is None:
      mol = Chem.MolFromSmiles("C")
      # This multiplication kludge guarantees unique smiles.
      mol = Chem.MolFromSmiles("C"*row_index)
    else:
      mol = Chem.MolFromSmiles(row["smiles"])
    row["smiles"] = smiles.get_smiles(mol)
+49 −18
Original line number Diff line number Diff line
@@ -77,6 +77,10 @@ def load_molecules(paths, dir_name="fingerprints"):
  Returns a dictionary that maps smiles strings to dicts that contain
  fingerprints, smiles strings, scaffolds, mol_ids.

  TODO(rbharath): This function assumes that all datapoints are uniquely keyed
  by smiles strings. This doesn't hold true for the pdbbind dataset. Need to find
  a more general indexing mechanism.

  Parameters
  ----------
  paths: list
@@ -100,6 +104,29 @@ def load_molecules(paths, dir_name="fingerprints"):
                                    "mol_id": mol_ids[mol]}
  return molecules 

def load_pdbbind_molecules(paths, dir_name="fingerprints"):
  """Load pdbbind dataset features keyed by smiles string.

  Returns a dictionary mapping each smiles string to a dict with keys
  "fingerprint", "scaffold", and "mol_id". Scaffolds and mol-ids are not
  available for the pdbbind pickles, so those entries are always None.

  Parameters
  ----------
  paths: list
    List of paths to pdbbind dataset directories.
  dir_name: str
    Currently ignored; pdbbind features live under "targets" (see kludge
    note below).

  Raises
  ------
  ValueError
    If a dataset directory contains no pickle files.
  """
  # TODO(rbharath): This is a total kludge. Clean up later.
  dir_name = "targets"
  molecules = {}
  for dataset_path in paths:
    pickle_dir = os.path.join(dataset_path, dir_name)
    pickle_files = os.listdir(pickle_dir)
    if not pickle_files:
      raise ValueError("No Pickle Files found to load molecules")
    for pickle_file in pickle_files:
      with gzip.open(os.path.join(pickle_dir, pickle_file), "rb") as f:
        contents = pickle.load(f)
        # Walk smiles/features in lockstep; scaffolds and mol-ids are
        # unavailable for pdbbind, so store None placeholders.
        for mol_smiles, fingerprint in zip(contents["smiles"],
                                           contents["features"]):
          molecules[mol_smiles] = {"fingerprint": fingerprint,
                                   "scaffold": None,
                                   "mol_id": None}
  return molecules

def get_target_names(paths, target_dir_name="targets"):
  """Get names of targets in provided collections.

@@ -121,7 +148,7 @@ def load_assays(paths, target_dir_name="targets"):

  Returns a dictionary that maps smiles strings to label vectors.

  TODO(rbharath): Simplify this function to only support the new pickle format.
  TODO(rbharath): Remove the use of smiles as unique identifier

  Parameters
  ----------
@@ -181,27 +208,30 @@ def load_datasets(paths, datatype="vs", **load_args):
  else:
    raise ValueError("Unsupported datatype.")

def load_pdbbind_datasets(pdbbind_paths):
def load_pdbbind_datasets(paths, target_dir_name="targets",
    fingerprint_dir_name="fingerprints"):
  """Load pdbbind datasets.

  TODO(rbharath): This uses smiles as unique identifier. FIX BEFORE RELEASE!

  Parameters
  ----------
  pdbbind_path: list 
    List of Pdbbind data files.
  """
  data = []
  for pdbbind_path in pdbbind_paths:
    with open(pdbbind_path, "rb") as csvfile:
      reader = csv.reader(csvfile)
      for row_ind, row in enumerate(reader):
        if row_ind == 0:
  data = {}
  molecules = load_pdbbind_molecules(paths)
  labels = load_assays(paths, target_dir_name)
  # TODO(rbharath): Why are there fewer descriptors than labels at times?
  # What accounts for the discrepancy? Please investigate.
  for ind, smiles in enumerate(molecules):
    if smiles not in labels:
      continue
        data.append({
          "label": row[0],
          "features": row[1],
        })
  df = pd.DataFrame(data)
  return df
    mol = molecules[smiles]
    data[ind] = {"fingerprint": mol["fingerprint"],
                 "scaffold": mol["scaffold"],
                 "labels": labels[smiles]}
  return data

def load_vs_datasets(paths, target_dir_name="targets",
    fingerprint_dir_name="fingerprints"):
@@ -242,7 +272,8 @@ def ensure_balanced(y, W):
    assert np.isclose(pos_weight, neg_weight)

def load_and_transform_dataset(paths, task_transforms,
    labels_endpoint="labels", weight_positives=True):
    labels_endpoint="labels", weight_positives=True,
    datatype="vs"):
  """Transform data labels as specified

  Parameters
@@ -255,9 +286,9 @@ def load_and_transform_dataset(paths, task_transforms,
    are performed in the order specified. An empty list corresponds to no
    transformations. Only for regression outputs.
  """
  dataset = load_datasets(paths)
  dataset = load_datasets(paths, datatype=datatype)
  X, y, W = transform_outputs(dataset, task_transforms,
      weight_positives=weight_positives)
      weight_positives=weight_positives, datatype=datatype)
  ## TODO(rbharath): Take this out once test passes
  #if weight_positives:
  #  ensure_balanced(y, W)
+30 −2
Original line number Diff line number Diff line
@@ -9,7 +9,8 @@ import numpy as np
import warnings
from deep_chem.utils.analysis import summarize_distribution

def transform_outputs(dataset, task_transforms, weight_positives=True):
def transform_outputs(dataset, task_transforms, weight_positives=True,
    datatype="pdbbind"):
  """Tranform the provided outputs

  Parameters
@@ -22,7 +23,10 @@ def transform_outputs(dataset, task_transforms, weight_positives=True):
    performed in the order specified. An empty list
    corresponds to no transformations. Only for regression outputs.
  """
  if datatype == "vs":
    X, y, W = dataset_to_numpy(dataset, weight_positives=weight_positives)
  elif datatype == "pdbbind":
    X, y, W = tensor_dataset_to_numpy(dataset)
  sorted_targets = sorted(task_transforms.keys())
  endpoints = sorted_targets
  transforms = task_transforms.copy()
@@ -109,6 +113,30 @@ def balance_positives(y, W):
    W[negative_inds, target_ind] = 1
  return W

def tensor_dataset_to_numpy(dataset, feature_endpoint="fingerprint",
    labels_endpoint="labels"):
  """Transforms a set of tensor data into numpy arrays (X, y, W).

  Parameters
  ----------
  dataset: dict
    Maps datapoint ids to dicts holding a feature tensor under
    feature_endpoint and a labels dict under labels_endpoint.
  feature_endpoint: str
    Key under which each datapoint stores its feature tensor.
  labels_endpoint: str
    Key under which each datapoint stores its labels dict.

  Returns
  -------
  (X, y, W) tuple of numpy arrays. Rows are ordered by sorted datapoint
  id, so the output is deterministic. W is all-ones (no reweighting).
  """
  n_samples = len(dataset)
  # Infer the per-sample feature shape from an arbitrary datapoint;
  # assumes every datapoint shares the same tensor shape — TODO confirm.
  sample_datapoint = next(iter(dataset.values()))
  feature_shape = np.shape(sample_datapoint[feature_endpoint])
  n_targets = 1  # TODO(rbharath): Generalize this later
  X = np.zeros((n_samples,) + feature_shape)
  y = np.zeros((n_samples, n_targets))
  W = np.ones((n_samples, n_targets))
  # Iterate ids in sorted order so row ordering is deterministic across
  # runs (previously sorted_ids was computed but never used).
  sorted_ids = sorted(dataset.keys())
  for index, datapoint_id in enumerate(sorted_ids):
    datapoint = dataset[datapoint_id]
    X[index] = datapoint[feature_endpoint]
    # TODO(rbharath): The label is a dict for some reason?!? Figure this out
    # and fix it.
    y[index] = datapoint[labels_endpoint]["3d_core_pdbbind"]
  return (X, y, W)

def dataset_to_numpy(dataset, feature_endpoint="fingerprint",
    labels_endpoint="labels", weight_positives=True):
  """Transforms a loaded dataset into numpy arrays (X, y).