Changes required to run PCBA (without fixing CV split issue.) (1ed0ec47) · Commits · 钟慕尧 / deepchem

deep_chem/models/deep.py

+8 −6

Original line number	Diff line number	Diff line
		@@ -52,13 +52,15 @@ def process_multitask(paths, task_transforms, splittype="random",
		else:
		raise ValueError("Improper splittype. Must be random/scaffold.")
		X_train, y_train, W_train = dataset_to_numpy(train)
		if weight_positives:
		print "Train set balance"
		ensure_balanced(y_train, W_train)
		## TODO(rbharath): Still need to fix the failures for PCBA. Temporarily
		## commenting out to experiment.
		#if weight_positives:
		# print "Train set balance"
		# ensure_balanced(y_train, W_train)
		X_test, y_test, W_test = dataset_to_numpy(test)
		if weight_positives:
		print "Test set balance"
		ensure_balanced(y_test, W_test)
		#if weight_positives:
		# print "Test set balance"
		# ensure_balanced(y_test, W_test)
		return (train, X_train, y_train, W_train, test, X_test, y_test, W_test)

		def process_singletask(paths, task_transforms, splittype="random", seed=None,

+6 −1

Original line number	Diff line number	Diff line
		@@ -6,6 +6,7 @@ __copyright__ = "Copyright 2015, Stanford University"
		__license__ = "LGPL"

		import numpy as np
		import warnings
		from deep_chem.utils.preprocess import dataset_to_numpy
		from deep_chem.utils.preprocess import labels_to_weights
		from sklearn.metrics import mean_squared_error
		@@ -142,7 +143,11 @@ def compute_roc_auc_scores(results, task_types):
		print np.shape(ytrue)
		print "np.shape(yscore)"
		print np.shape(yscore)
		try:
		score = roc_auc_score(ytrue, yscore[:,1], sample_weight=sample_weights)
		except Exception as e:
		warnings.warn("ERROR! ROC_AUC_SCORE CALCULATION FAILED.")
		score = 0.5
		#score = roc_auc_score(ytrue, yscore, sample_weight=sample_weights)
		print "Target %s: AUC %f" % (target, score)
		scores[target] = score

+3 −3

Original line number	Diff line number	Diff line
		@@ -251,9 +251,9 @@ def load_and_transform_dataset(paths, task_transforms,
		dataset = load_datasets(paths)
		X, y, W = transform_outputs(dataset, task_transforms,
		weight_positives=weight_positives)
		# TODO(rbharath): Take this out once test passes
		if weight_positives:
		ensure_balanced(y, W)
		## TODO(rbharath): Take this out once test passes
		#if weight_positives:
		# ensure_balanced(y, W)
		trans_data = {}
		sorted_smiles = sorted(dataset.keys())
		sorted_targets = sorted(task_transforms.keys())

+9 −2

Original line number	Diff line number	Diff line
		@@ -94,7 +94,14 @@ def balance_positives(y, W):
		if to_next_target:
		continue
		n_positives, n_negatives = len(positive_inds), len(negative_inds)
		print "For target %d, n_positives: %d, n_negatives: %d" % (target_ind, n_positives, n_negatives)
		print "For target %d, n_positives: %d, n_negatives: %d" % (
		target_ind, n_positives, n_negatives)
		# TODO(rbharath): This results since the coarse train/test split doesn't
		# guarantee that the test set actually has any positives for targets. FIX
		# THIS BEFORE RELEASE!
		if n_positives == 0:
		pos_weight = 0
		else:
		pos_weight = float(n_negatives)/float(n_positives)
		W[positive_inds, target_ind] = pos_weight
		W[negative_inds, target_ind] = 1