style change (e99ed5a0) · Commits · 钟慕尧 / deepchem

deepchem/models/tensorflow_models/lr.py

+48 −50

Original line number	Diff line number	Diff line
		@@ -17,6 +17,7 @@ from deepchem.utils.save import log
		from deepchem.data import pad_features
		from deepchem.metrics import to_one_hot


		def weight_decay(penalty_type, penalty):
		# due to the different shape of weight(ndims=2) and bias(ndims=1),
		# will using this version for logreg
		@@ -40,6 +41,7 @@ def weight_decay(penalty_type, penalty):

		class TensorflowLogisticRegression(TensorflowGraphModel):
		""" A simple tensorflow based logistic regression model. """

		def build(self, graph, name_scopes, training):
		"""Constructs the graph architecture of model: n_tasks * sigmoid nodes.

		@@ -47,15 +49,13 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
		mol_features: Molecule descriptor (e.g. fingerprint) tensor with shape
		batch_size x n_features.
		"""
		placeholder_scope = TensorflowGraph.get_placeholder_scope(
		graph, name_scopes)
		placeholder_scope = TensorflowGraph.get_placeholder_scope(graph,
		name_scopes)
		n_features = self.n_features
		with graph.as_default():
		with placeholder_scope:
		self.mol_features = tf.placeholder(
		tf.float32,
		shape=[None, n_features],
		name='mol_features')
		tf.float32, shape=[None, n_features], name='mol_features')

		weight_init_stddevs = self.weight_init_stddevs
		bias_init_consts = self.bias_init_consts
		@@ -66,22 +66,22 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
		tensor=self.mol_features,
		size=1,
		weight_init=tf.truncated_normal(
		shape=[self.n_features, 1],
		stddev=weight_init_stddevs[0]),
		bias_init=tf.constant(value=bias_init_consts[0],
		shape=[1]))
		shape=[self.n_features, 1], stddev=weight_init_stddevs[0]),
		bias_init=tf.constant(value=bias_init_consts[0], shape=[1]))
		lg_list.append(lg)
		return lg_list

		def add_label_placeholders(self, graph, name_scopes):
		#label placeholders with size batch_size * 1
		labels = []
		placeholder_scope = TensorflowGraph.get_placeholder_scope(graph, name_scopes)
		placeholder_scope = TensorflowGraph.get_placeholder_scope(graph,
		name_scopes)
		with placeholder_scope:
		for task in range(self.n_tasks):
		labels.append(tf.identity(
		tf.placeholder(tf.float32, shape=[None,1],
		name='labels_%d' % task)))
		labels.append(
		tf.identity(
		tf.placeholder(
		tf.float32, shape=[None, 1], name='labels_%d' % task)))
		return labels

		def add_training_cost(self, graph, name_scopes, output, labels, weights):
		@@ -93,8 +93,8 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
		with TensorflowGraph.shared_name_scope('costs', graph, name_scopes):
		for task in range(self.n_tasks):
		task_str = str(task).zfill(len(str(self.n_tasks)))
		with TensorflowGraph.shared_name_scope(
		'cost_{}'.format(task_str), graph, name_scopes):
		with TensorflowGraph.shared_name_scope('cost_{}'.format(task_str),
		graph, name_scopes):
		with tf.name_scope('weighted'):
		weighted_cost = self.cost(output[task], labels[task],
		weights[task])
		@@ -105,12 +105,13 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
		# non-zero weight examples in the batch. Also, instead of using
		# tf.reduce_mean (which can put ops on the CPU) we explicitly
		# calculate with div/sum so it stays on the GPU.
		gradient_cost = tf.div(tf.reduce_sum(weighted_cost),
		self.batch_size)
		gradient_cost = tf.div(
		tf.reduce_sum(weighted_cost), self.batch_size)
		gradient_costs.append(gradient_cost)

		# aggregated costs
		with TensorflowGraph.shared_name_scope('aggregated', graph, name_scopes):
		with TensorflowGraph.shared_name_scope('aggregated', graph,
		name_scopes):
		with tf.name_scope('gradient'):
		loss = tf.add_n(gradient_costs)

		@@ -123,8 +124,8 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
		return loss

		def cost(self, logits, labels, weights):
		return tf.mul(tf.nn.sigmoid_cross_entropy_with_logits(logits, labels),
		weights)
		return tf.mul(
		tf.nn.sigmoid_cross_entropy_with_logits(logits, labels), weights)

		def add_output_ops(self, graph, output):
		# adding output nodes of sigmoid function
		@@ -152,8 +153,7 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
		orig_dict["weights_%d" % task] = w_b[:, task]
		else:
		# Dummy placeholders
		orig_dict["weights_%d" % task] = np.ones(
		(self.batch_size,))
		orig_dict["weights_%d" % task] = np.ones((self.batch_size,))
		return TensorflowGraph.get_feed_dict(orig_dict)

		def predict_proba_on_batch(self, X):
		@@ -172,16 +172,15 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
		# transfer 2D prediction tensor to 2D x n_classes(=2)
		complimentary = np.ones(np.shape(batch_outputs))
		complimentary = complimentary - batch_outputs
		batch_outputs = np.concatenate([complimentary, batch_outputs],
		axis = batch_outputs.ndim-1)
		batch_outputs = np.concatenate(
		[complimentary, batch_outputs], axis=batch_outputs.ndim - 1)
		# reshape to batch_size x n_tasks x ...
		if batch_outputs.ndim == 3:
		batch_outputs = batch_outputs.transpose((1, 0, 2))
		elif batch_outputs.ndim == 2:
		batch_outputs = batch_outputs.transpose((1, 0))
		else:
		raise ValueError(
		'Unrecognized rank combination for output: %s ' %
		raise ValueError('Unrecognized rank combination for output: %s ' %
		(batch_outputs.shape,))

		outputs = batch_outputs
		@@ -209,20 +208,19 @@ class TensorflowLogisticRegression(TensorflowGraphModel):
		# transfer 2D prediction tensor to 2D x n_classes(=2)
		complimentary = np.ones(np.shape(batch_output))
		complimentary = complimentary - batch_output
		batch_output = np.concatenate([complimentary, batch_output],
		axis = batch_output.ndim-1)
		batch_output = np.concatenate(
		[complimentary, batch_output], axis=batch_output.ndim - 1)
		# reshape to batch_size x n_tasks x ...
		if batch_output.ndim == 3:
		batch_output = batch_output.transpose((1, 0, 2))
		elif batch_output.ndim == 2:
		batch_output = batch_output.transpose((1, 0))
		else:
		raise ValueError(
		'Unrecognized rank combination for output: %s' %
		raise ValueError('Unrecognized rank combination for output: %s' %
		(batch_output.shape,))
		output.append(batch_output)

		outputs = np.array(from_one_hot(
		np.squeeze(np.concatenate(output)), axis=-1))
		outputs = np.array(
		from_one_hot(np.squeeze(np.concatenate(output)), axis=-1))

		return np.copy(outputs)

deepchem/trans/transformers.py

+29 −16

Original line number	Diff line number	Diff line
		@@ -661,12 +661,15 @@ class IRVTransformer():
		top_labels = []
		with g_temp.as_default():
		labels_tf = tf.constant(y)
		similarity_placeholder = tf.placeholder(dtype=tf.float64, shape=(None,reference_len))
		value, indice = tf.nn.top_k(similarity_placeholder, k=self.K+1, sorted=True)
		similarity_placeholder = tf.placeholder(
		dtype=tf.float64, shape=(None, reference_len))
		value, indice = tf.nn.top_k(
		similarity_placeholder, k=self.K + 1, sorted=True)
		top_label = tf.gather(labels_tf, indice)
		feed_dict = {}
		for count in range(target_len // 100 + 1):
		feed_dict[similarity_placeholder] = similarity_xs[count*100:min((count+1)*100, target_len),:]
		feed_dict[similarity_placeholder] = similarity_xs[count * 100:min((
		count + 1) * 100, target_len), :]
		with tf.Session() as sess:
		fetched_values = sess.run([value, top_label], feed_dict=feed_dict)
		values.append(fetched_values[0])
		@@ -675,9 +678,14 @@ class IRVTransformer():
		top_labels = np.concatenate(top_labels, axis=0)
		for count in range(values.shape[0]):
		if values[count, 0] == 1:
		features.append(np.concatenate([values[count, 1:(self.K+1)], top_labels[count, 1:(self.K+1)]]))
		features.append(
		np.concatenate([
		values[count, 1:(self.K + 1)], top_labels[count, 1:(self.K + 1)]
		]))
		else:
		features.append(np.concatenate([values[count, 0:self.K], top_labels[count, 0:self.K]]))
		features.append(
		np.concatenate(
		[values[count, 0:self.K], top_labels[count, 0:self.K]]))
		return features

		def X_transform(self, X_target):
		@@ -701,7 +709,8 @@ class IRVTransformer():
		print('start similarity calculation')
		time1 = time.time()
		similarity = IRVTransformer.matrix_mul(X_target, np.transpose(self.X)) / (
		n_features - IRVTransformer.matrix_mul(1 - X_target, np.transpose(1 - self.X)))
		n_features - IRVTransformer.matrix_mul(1 - X_target,
		np.transpose(1 - self.X)))
		time2 = time.time()
		print('similarity calculation takes %i s' % (time2 - time1))
		for i in range(self.n_tasks):
		@@ -721,8 +730,10 @@ class IRVTransformer():
		for X1_id in range(X1_iter):
		result = np.zeros((1,))
		for X2_id in range(X2_iter):
		partial_result = np.matmul(X1[X1_id*shard_size:min((X1_id+1)*shard_size, X1_shape[0]),:],
		X2[:, X2_id*shard_size:min((X2_id+1)*shard_size, X2_shape[1])])
		partial_result = np.matmul(X1[X1_id * shard_size:min((
		X1_id + 1) * shard_size, X1_shape[0]), :],
		X2[:, X2_id * shard_size:min((
		X2_id + 1) * shard_size, X2_shape[1])])
		if result.size == 1:
		result = partial_result
		else:
		@@ -739,7 +750,9 @@ class IRVTransformer():
		X_length = dataset.X.shape[0]
		X_trans = []
		for count in range(X_length // 5000 + 1):
		X_trans.append(self.X_transform(dataset.X[count*5000:min((count+1)*5000,X_length), :]))
		X_trans.append(
		self.X_transform(dataset.X[count * 5000:min((count + 1) * 5000,
		X_length), :]))
		X_trans = np.concatenate(X_trans, axis=0)
		return NumpyDataset(X_trans, dataset.y, dataset.w, ids=None)

examples/benchmark.py

+6 −3

Original line number	Diff line number	Diff line
		@@ -56,6 +56,7 @@ from sampl.sampl_datasets import load_sampl
		from clintox.clintox_datasets import load_clintox
		from hiv.hiv_datasets import load_hiv


		def benchmark_loading_datasets(hyper_parameters,
		dataset='tox21',
		model='tf',
		@@ -812,8 +813,8 @@ if __name__ == '__main__':
		#irv, rf, rf_regression should be assigned manually
		if len(datasets) == 0:
		datasets = [
		'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'hiv',
		'sampl', 'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'qm7b'
		'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'hiv', 'sampl',
		'delaney', 'nci', 'kaggle', 'pdbbind', 'chembl', 'qm7b'
		]

		#input hyperparameters
		@@ -899,7 +900,9 @@ if __name__ == '__main__':

		for split in splitters:
		for dataset in datasets:
		if dataset in ['tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'hiv']:
		if dataset in [
		'tox21', 'sider', 'muv', 'toxcast', 'pcba', 'clintox', 'hiv'
		]:
		for model in models:
		if model in ['tf', 'tf_robust', 'logreg', 'graphconv', 'rf', 'irv']:
		benchmark_loading_datasets(

examples/hiv/hiv_datasets.py

+11 −8

Original line number	Diff line number	Diff line
		@@ -10,13 +10,13 @@ import numpy as np
		import shutil
		import deepchem as dc


		def load_hiv(featurizer='ECFP', split='index'):
		"""Load hiv datasets. Does not do train/test split"""
		# Featurize hiv dataset
		print("About to featurize hiv dataset.")
		current_dir = os.path.dirname(os.path.realpath(__file__))
		dataset_file = os.path.join(
		current_dir, "./HIV.csv")
		dataset_file = os.path.join(current_dir, "./HIV.csv")
		hiv_tasks = ["HIV_active"]
		if featurizer == 'ECFP':
		featurizer_func = dc.feat.CircularFingerprint(size=1024)
		@@ -27,16 +27,19 @@ def load_hiv(featurizer='ECFP', split='index'):
		dataset = loader.featurize(dataset_file, shard_size=8192)
		# Initialize transformers
		transformers = [
		dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]
		dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
		]

		print("About to transform data")
		for transformer in transformers:
		dataset = transformer.transform(dataset)

		splitters = {'index': dc.splits.IndexSplitter(),
		splitters = {
		'index': dc.splits.IndexSplitter(),
		'random': dc.splits.RandomSplitter(),
		'scaffold': dc.splits.ScaffoldSplitter(),
		'butina': dc.splits.ButinaSplitter()}
		'butina': dc.splits.ButinaSplitter()
		}
		splitter = splitters[split]
		train, valid, test = splitter.train_valid_test_split(dataset)
		return hiv_tasks, (train, valid, test), transformers

examples/hiv/hiv_irv.py

+1 −4

Original line number	Diff line number	Diff line
		@@ -25,10 +25,7 @@ train_dataset = transformer.transform(train_dataset)
		valid_dataset = transformer.transform(valid_dataset)

		model = dc.models.TensorflowMultiTaskIRVClassifier(
		len(hiv_tasks),
		K=10,
		batch_size=50,
		learning_rate=0.001)
		len(hiv_tasks), K=10, batch_size=50, learning_rate=0.001)

		# Fit trained model
		model.fit(train_dataset)

Admin message