Cleanup. (ee5cccc0) · Commits · 钟慕尧 / deepchem

deep_chem/models/standard.py

+0 −5

Original line number	Diff line number	Diff line
		@@ -28,13 +28,8 @@ def fit_singletask_models(train_data, modeltype):
		None or "log". Only for regression outputs.
		"""
		models = {}
		print "fit_singletask_models()"
		print "train_data.keys()"
		print train_data.keys()
		import numpy as np
		X_train = train_data["features"]
		print "np.shape(X_train)"
		print np.shape(X_train)
		sorted_tasks = train_data["sorted_tasks"]
		for task in sorted_tasks:
		print "Building model for task %s" % task

+1 −4

Original line number	Diff line number	Diff line
		@@ -63,7 +63,6 @@ def generate_vs_utils_features(dataframe, name, out, smiles_field, id_field,
		feature_dir = os.path.join(dataset_dir, featuretype)
		features_file = os.path.join(feature_dir, "%s-%s.pkl.gz" % (name, featuretype))

		print("About to instantiate featurizer.")
		if featuretype == "fingerprints":
		featurizer = CircularFingerprint(size=1024)
		elif featuretype == "descriptors":
		@@ -78,18 +77,16 @@ def generate_vs_utils_features(dataframe, name, out, smiles_field, id_field,
		print("Featurizing molecule %d" % row_ind)
		mol = Chem.MolFromSmiles(row_data)
		features.append(featurizer.featurize([mol]))
		print("Done generating features. About to transfer them to dataframe.")

		feature_df = pd.DataFrame([])
		feature_df["features"] = pd.DataFrame(
		[{"features": feature} for feature in features])

		print("Done transfering to dataframe. About to populate remaining df fields.")
		feature_df["smiles"] = dataframe[[smiles_field]]
		feature_df["scaffolds"] = dataframe[[smiles_field]].apply(
		functools.partial(generate_scaffold, smiles_field=smiles_field),
		axis=1)
		feature_df["mol_id"] = dataframe[[id_field]]
		print("Populated 'smiles', 'scaffolds', 'mol_id' fields")

		print("About to write pkl.gz file")
		with gzip.open(features_file, "wb") as gzip_file:

+4 −4

Original line number	Diff line number	Diff line
		@@ -22,12 +22,12 @@ def process_datasets(paths, feature_types=None, mode="multitask",
		splittype="random", target_names=None):
		"""Extracts datasets and splits them into train/test.

		Returns a dict with the following keys
		Returns a dict with the following key/value pairs

		"features" -> X
		"mol_ids" -> ids
		features -> X
		mol_ids -> ids
		target -> (y, W)
		"sorted_targets" -> sorted_targets
		sorted_targets -> sorted_targets

		Parameters
		----------

+0 −26

Original line number	Diff line number	Diff line
		@@ -345,29 +345,3 @@ def scaffold_separate(dataset):
		scaffolds[scaffold].append(mol_id)
		# Sort from largest to smallest scaffold sets
		return [elt for (scaffold, elt) in sorted(scaffolds.items(), key=lambda x: -len(x[1]))]

		#def labels_to_weights(ytrue):
		# """Uses the true labels to compute and output sample weights.
		#
		# Parameters
		# ----------
		# ytrue: list or np.ndarray
		# True labels.
		# """
		# n_total = np.shape(ytrue)[0]
		# n_positives = np.sum(ytrue)
		# n_negatives = n_total - n_positives
		# pos_weight = np.floor(n_negatives/n_positives)
		#
		# sample_weights = np.zeros(np.shape(ytrue)[0])
		# for ind, entry in enumerate(ytrue):
		# if entry == 0: # negative
		# sample_weights[ind] = 1
		# elif entry == 1: # positive
		# sample_weights[ind] = pos_weight
		# else:
		# print("labels_to_weights()")
		# print("ytrue")
		# print(ytrue)
		# raise ValueError("ytrue can only contain 0s or 1s.")
		# return sample_weights