Formatting (57bcc4d4) · Commits · 钟慕尧 / deepchem

examples/pdbbind/pdbbind_datasets.py

+37 −13

Original line number	Diff line number	Diff line
		@@ -15,10 +15,27 @@ import re
		from rdkit import Chem
		import deepchem as dc


		def load_pdbbind_labels(labels_file):
		"""Loads pdbbind labels as dataframe"""
		# Some complexes have labels but no PDB files. Filter these manually
		missing_pdbs = ["1d2v", "1jou", "1s8j", "3f39", "3i3d", "3i3b", "3dyo", "3t0d", "1cam", "3vdb", "3f37", "3f38", "4mlt", "3f36", "4o7d", "3t08", "3f34", "3f35", "2wik", "4mlx", "2wij", "1px4", "4wkt", "3f33", "2wig", "3muz", "3t2p", "3t2q", "4pji", "2adj", "3t09", "3mv0", "1pts", "3vd9", "3axk", "4q1s", "3t0b", "4b82", "3vd7", "3hg1", "3vd4", "3vdc", "3b5y", "4oi6", "3axm", "4mdm", "2mlm", "3eql", "4ob0", "3wi6", "4fgt", "4pnc", "4mvn", "4lv3", "4lz9", "1pyg", "3h1k", "7gpb", "1e8h", "4wku", "2f2h", "1zyr", "1z9j", "3b5d", "3b62", "4q3q", "4mdl", "4no6", "4mdg", "3dxj", "4u0x", "4l6q", "4q3r", "1h9s", "4ob1", "4ob2", "4qq5", "4nk3", "3k1j", "4m8t", "4mzo", "4nnn", "4q3s", "4nnw", "3cf1", "4u5t", "4wkv", "4ool", "3a2c", "4wm9", "4pkb", "4qkx", "4no8", "1ztz", "1nu1", "4kn4", "4mao", "4qqc", "4len", "4lv1", "4r02", "4r6v", "4fil", "4q2k", "1hpb", "4oon", "4qbb", "4ruu", "4no1", "3w8o", "4kn7", "4r17", "4r18", "5hvp", "1e59", "1sqq", "3n75", "4kmu", "4mzs", "1sqb", "1lr8", "4lv2", "4wmc", "1sqp", "3whw", "4cpa", "3i8w", "4hrd", "4hrc", "1ntk", "1rbo"]
		missing_pdbs = [
		"1d2v", "1jou", "1s8j", "3f39", "3i3d", "3i3b", "3dyo", "3t0d", "1cam",
		"3vdb", "3f37", "3f38", "4mlt", "3f36", "4o7d", "3t08", "3f34", "3f35",
		"2wik", "4mlx", "2wij", "1px4", "4wkt", "3f33", "2wig", "3muz", "3t2p",
		"3t2q", "4pji", "2adj", "3t09", "3mv0", "1pts", "3vd9", "3axk", "4q1s",
		"3t0b", "4b82", "3vd7", "3hg1", "3vd4", "3vdc", "3b5y", "4oi6", "3axm",
		"4mdm", "2mlm", "3eql", "4ob0", "3wi6", "4fgt", "4pnc", "4mvn", "4lv3",
		"4lz9", "1pyg", "3h1k", "7gpb", "1e8h", "4wku", "2f2h", "1zyr", "1z9j",
		"3b5d", "3b62", "4q3q", "4mdl", "4no6", "4mdg", "3dxj", "4u0x", "4l6q",
		"4q3r", "1h9s", "4ob1", "4ob2", "4qq5", "4nk3", "3k1j", "4m8t", "4mzo",
		"4nnn", "4q3s", "4nnw", "3cf1", "4u5t", "4wkv", "4ool", "3a2c", "4wm9",
		"4pkb", "4qkx", "4no8", "1ztz", "1nu1", "4kn4", "4mao", "4qqc", "4len",
		"4lv1", "4r02", "4r6v", "4fil", "4q2k", "1hpb", "4oon", "4qbb", "4ruu",
		"4no1", "3w8o", "4kn7", "4r17", "4r18", "5hvp", "1e59", "1sqq", "3n75",
		"4kmu", "4mzs", "1sqb", "1lr8", "4lv2", "4wmc", "1sqp", "3whw", "4cpa",
		"3i8w", "4hrd", "4hrc", "1ntk", "1rbo"
		]
		contents = []
		with open(labels_file) as f:
		for line in f:
		@@ -40,15 +57,16 @@ def load_pdbbind_labels(labels_file):
		"ignore-this-field", "reference", "ligand name"))
		return contents_df


		def compute_pdbbind_features(grid_featurizer, pdb_subdir, pdb_code):
		  """Compute features for a single protein-ligand complex.

		  Parameters
		  ----------
		  grid_featurizer: object
		    Featurizer exposing `featurize_complexes(ligand_files, protein_files)`.
		  pdb_subdir: str
		    Directory expected to hold `<pdb_code>_protein.pdb` and
		    `<pdb_code>_ligand.sdf`.
		  pdb_code: str
		    Identifier used to build the two file names.

		  Returns
		  -------
		  The featurizer output passed through `np.squeeze`, i.e. with all
		  length-one axes removed.
		  """
		  protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
		  ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)
		  # Featurize exactly once; the call is given one-element lists, so the
		  # result is squeezed before being returned.
		  features = grid_featurizer.featurize_complexes([ligand_file], [protein_file])
		  return np.squeeze(features)


		def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
		"""Featurizes pdbbind according to provided featurization"""
		tasks = ["-logKd/Ki"]
		@@ -77,19 +95,22 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
		# Define featurizers
		if feat == "grid":
		featurizer = dc.feat.GridFeaturizer(
		voxel_width=16.0, feature_types="voxel_combined",
		voxel_width=16.0,
		feature_types="voxel_combined",
		# TODO(rbharath, enf, leswing): Figure out why pi_stack and cation_pi
		# reduce validation performance
		# voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
		# "salt_bridge"], ecfp_power=9, splif_power=9,
		voxel_feature_types=["ecfp", "splif", "hbond", "salt_bridge"],
		ecfp_power=9, splif_power=9,
		parallel=True, flatten=True)
		ecfp_power=9,
		splif_power=9,
		parallel=True,
		flatten=True)
		elif feat == "coord":
		neighbor_cutoff = 4
		max_num_neighbors = 10
		featurizer = dc.feat.NeighborListComplexAtomicCoordinates(
		max_num_neighbors, neighbor_cutoff)
		featurizer = dc.feat.NeighborListComplexAtomicCoordinates(max_num_neighbors,
		neighbor_cutoff)
		else:
		raise ValueError("feat not defined.")

		@@ -106,8 +127,8 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
		print("%s is missing!" % pdb_subdir)
		missing_pdbs.append(pdb_subdir)
		continue
		computed_feature = compute_pdbbind_features(
		featurizer, pdb_subdir, pdb_code)
		computed_feature = compute_pdbbind_features(featurizer, pdb_subdir,
		pdb_code)
		if feature_len is None:
		feature_len = len(computed_feature)
		if len(computed_feature) != feature_len:
		@@ -126,12 +147,15 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
		dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids, data_dir=data_dir)
		return dataset, tasks


		def load_pdbbind_grid(split="index", featurizer="grid", subset="full"):
		"""Load PDBBind datasets. Does not do train/test split"""
		dataset, tasks = featurize_pdbbind(feat=featurizer, subset=subset)

		splitters = {'index': dc.splits.IndexSplitter(),
		'random': dc.splits.RandomSplitter()}
		splitters = {
		'index': dc.splits.IndexSplitter(),
		'random': dc.splits.RandomSplitter()
		}
		splitter = splitters[split]
		train, valid, test = splitter.train_valid_test_split(dataset)

Admin message