More cleanup. Last commit before start of some API surgery. (c2c3cd44) · Commits · 钟慕尧 / deepchem

deep_chem/scripts/modeler.py

+0 −2

Original line number	Diff line number	Diff line
		@@ -249,7 +249,5 @@ def main():
		args = parse_args()
		args.func(args)



		if __name__ == "__main__":
		main()

deep_chem/utils/featurize.py

+0 −27

Original line number	Diff line number	Diff line
		@@ -90,33 +90,6 @@ def generate_vs_util_features(df, name, out, smiles_endpoint, id_endpoint, featu
		with gzip.open(features, "wb") as f:
		pickle.dump(feature_df, f, pickle.HIGHEST_PROTOCOL)

		'''
		def generate_descriptors(df, name, out, smiles_endpoint, id_endpoint):
		"""Generates molecular descriptors for dataset."""
		dataset_dir = os.path.join(out, name)
		descriptors_dir = os.path.join(dataset_dir, "descriptors")
		shards_dir = os.path.join(dataset_dir, "shards")
		descriptors = os.path.join(descriptors_dir,
		"%s-descriptors.pkl.gz" % name)

		descriptors_df = pd.DataFrame([])
		descriptors_df["smiles"] = df[[smiles_endpoint]]
		descriptors_df["scaffolds"] = df[[smiles_endpoint]].apply(
		functools.partial(generate_scaffold, smiles_endpoint=smiles_endpoint),
		axis=1)
		descriptors_df["mol_id"] = df[[id_endpoint]]
		mols = []
		for row in df.iterrows():
		# pandas rows are tuples (row_num, row_data)
		smiles = row[1][smiles_endpoint]
		mols.append(Chem.MolFromSmiles(smiles))
		featurizer = SimpleDescriptors()
		descriptors_df["features"] = pd.DataFrame([ {"features": feature} for feature in featurizer.featurize(mols)])

		with gzip.open(descriptors, "wb") as f:
		pickle.dump(descriptors_df, f, pickle.HIGHEST_PROTOCOL)
		'''

		def get_rows(input_file, input_type, delimiter):
		"""Returns an iterator over all rows in input_file"""
		# TODO(rbharath): This function loads into memory, which can be painful. The

deep_chem/utils/preprocess.py

+0 −50

Original line number	Diff line number	Diff line
		@@ -151,7 +151,6 @@ def dataset_to_numpy(dataset, feature_endpoint="fingerprint",
		n_samples = len(dataset.keys())
		sample_datapoint = dataset.itervalues().next()
		feature_shape = np.shape(sample_datapoint[feature_endpoint])
		print np.shape(feature_shape)

		#n_targets = 1 # TODO(rbharath): Generalize this later
		n_targets = len(sample_datapoint[labels_endpoint])
		@@ -173,59 +172,10 @@ def dataset_to_numpy(dataset, feature_endpoint="fingerprint",
		W[index][t_ind] = 0
		else:
		y[index][t_ind] = labels[target]
		print "DATASET_TO_NUMPY"
		print "np.shape(X)"
		print np.shape(X)
		if weight_positives:
		W = balance_positives(y, W)
		return (X, y, W)

		"""
		def dataset_to_numpy(dataset, feature_endpoint="fingerprint",
		labels_endpoint="labels", weight_positives=True):
		'''Transforms a loaded dataset into numpy arrays (X, y).

		Transforms provided dict into feature matrix X (of dimensions [n_samples,
		n_features]) and label matrix y (of dimensions [n_samples,
		n_targets+n_desc]), where n_targets is the number of assays in the
		provided dataset and n_desc is the number of computed descriptors we'd
		like to predict.

		Note that this function transforms missing data into negative examples
		(this is relatively safe since the ratio of positive to negative examples
		is on the order 1/100)

		Parameters
		----------
		dataset: dict
		A dictionary of type produced by load_datasets.
		'''
		n_samples = len(dataset.keys())
		sample_datapoint = dataset.itervalues().next()
		n_features = np.size(sample_datapoint[feature_endpoint])
		n_targets = len(sample_datapoint[labels_endpoint])
		X = np.zeros((n_samples, n_features))
		y = np.zeros((n_samples, n_targets))
		W = np.ones((n_samples, n_targets))
		sorted_smiles = sorted(dataset.keys())
		for index, smiles in enumerate(sorted_smiles):
		datapoint = dataset[smiles]
		fingerprint, labels = (datapoint[feature_endpoint],
		datapoint[labels_endpoint])
		X[index] = np.array(fingerprint).flatten()
		sorted_targets = sorted(labels.keys())
		# Set labels from measurements
		for t_ind, target in enumerate(sorted_targets):
		if labels[target] == -1:
		y[index][t_ind] = -1
		W[index][t_ind] = 0
		else:
		y[index][t_ind] = labels[target]
		if weight_positives:
		W = balance_positives(y, W)
		return X, y, W
		"""

		def multitask_to_singletask(dataset):
		"""Transforms a multitask dataset to a singletask dataset.

Admin message