Changes (62dcf20f) · Commits · 钟慕尧 / deepchem

deepchem/feat/atomic_coordinates.py

+4 −4

Original line number	Diff line number	Diff line
		@@ -175,10 +175,10 @@ class NeighborListComplexAtomicCoordinates(ComplexFeaturizer):

		Parameters
		----------
		mol_pdb: list
		Should be a list of lines of the PDB file.
		complex_pdb: list
		Should be a list of lines of the PDB file.
		mol_pdb_file: Str
		Filename for ligand pdb file.
		protein_pdb_file: Str
		Filename for protein pdb file.
		"""
		mol_coords, ob_mol = rdkit_util.load_molecule(mol_pdb_file)
		protein_coords, protein_mol = rdkit_util.load_molecule(protein_pdb_file)

deepchem/feat/base_classes.py

+7 −5

Original line number	Diff line number	Diff line
		"""
		Feature calculations.
		"""
		import logging
		import types
		import numpy as np
		from rdkit import Chem
		@@ -12,7 +13,8 @@ __copyright__ = "Copyright 2014, Stanford University"
		__license__ = "BSD 3-clause"


		def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file):
		def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
		logging.info(log_message)
		return featurizer._featurize_complex(mol_pdb_file, protein_pdb_file)


		@@ -21,7 +23,7 @@ class ComplexFeaturizer(object):
		Abstract class for calculating features for mol/protein complexes.
		"""

		def featurize_complexes(self, mol_files, protein_pdbs, log_every_n=1000):
		def featurize_complexes(self, mol_files, protein_pdbs):
		"""
		Calculate features for mol/protein complexes.

		@@ -42,10 +44,10 @@ class ComplexFeaturizer(object):
		pool = multiprocessing.Pool()
		results = []
		for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
		log_message = "Featurizing %d / %d" % (
		i, len(mol_files)) if i % log_every_n == 0 else None
		log_message = "Featurizing %d / %d" % (i, len(mol_files))
		results.append(
		pool.apply_async(_featurize_complex, (self, mol_file, protein_pdb)))
		pool.apply_async(_featurize_complex,
		(self, mol_file, protein_pdb, log_message)))
		pool.close()
		features = []
		failures = []

deepchem/feat/rdkit_grid_featurizer.py

+16 −103

Original line number	Diff line number	Diff line
		@@ -29,20 +29,6 @@ TODO(LESWING) add sanitization with rdkit upgrade to 2017.*
		"""


		def get_ligand_filetype(ligand_filename):
		"""Returns the filetype of ligand."""
		if ".mol2" in ligand_filename:
		return "mol2"
		elif ".sdf" in ligand_filename:
		return "sdf"
		elif ".pdbqt" in ligand_filename:
		return "pdbqt"
		elif ".pdb" in ligand_filename:
		return "pdb"
		else:
		raise ValueError("Unrecognized_filename")


		def compute_centroid(coordinates):
		"""Compute the x,y,z centroid of provided coordinates

		@@ -1228,85 +1214,10 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
		]
		raise ValueError('Unknown feature type "%s"' % feature_name)

		def _featurize_complex(self, ligand_ext, ligand_lines, protein_pdb_lines,
		log_message):
		tempdir = tempfile.mkdtemp()
		if log_message is not None:
		log(log_message)
		def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
		"""Computes grid featurization of protein/ligand complex.

		############################################################## TIMING
		time1 = time.time()
		############################################################## TIMING
		ligand_file = os.path.join(tempdir, "ligand.%s" % ligand_ext)
		with open(ligand_file, "w") as mol_f:
		mol_f.writelines(ligand_lines)
		############################################################## TIMING
		time2 = time.time()
		log("TIMING: Writing ligand took %0.3f s" % (time2 - time1), self.verbose)
		############################################################## TIMING

		############################################################## TIMING
		time1 = time.time()
		############################################################## TIMING
		protein_pdb_file = os.path.join(tempdir, "protein.pdb")
		with open(protein_pdb_file, "w") as protein_f:
		protein_f.writelines(protein_pdb_lines)
		############################################################## TIMING
		time2 = time.time()
		log("TIMING: Writing protein took %0.3f s" % (time2 - time1), self.verbose)
		############################################################## TIMING

		features_dict = self._transform(protein_pdb_file, ligand_file)
		shutil.rmtree(tempdir)
		# Handle case the transform failed
		if features_dict is not None:
		return list(features_dict.values())
		else:
		return None

		def featurize_complexes(self, mol_files, protein_pdbs, log_every_n=1000):
		"""
		Calculate features for mol/protein complexes.

		Parameters
		----------
		mols: list
		List of PDB filenames for molecules.
		protein_pdbs: list
		List of PDB filenames for proteins.
		"""
		pool = multiprocessing.Pool()
		results = []
		for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
		log_message = "Featurizing %d / %d" % (
		i, len(mol_files)) if i % log_every_n == 0 else None
		ligand_ext = get_ligand_filetype(mol_file)
		with open(mol_file) as mol_f:
		mol_lines = mol_f.readlines()
		with open(protein_pdb) as protein_file:
		protein_pdb_lines = protein_file.readlines()
		results.append(
		pool.apply_async(
		_featurize_complex,
		(self, ligand_ext, mol_lines, protein_pdb_lines, log_message)))
		pool.close()
		features = []
		failures = []
		for ind, result in enumerate(results):
		new_features = result.get()
		# Handle loading failures which return None
		if new_features is not None:
		features.append(new_features)
		else:
		failures.append(ind)
		features = np.asarray(features)
		return features, failures

		def _transform(self, protein_pdb, ligand_file):
		"""Computes featurization of protein/ligand complex.

		Takes as input files (strings) for pdb of the protein, pdb of the ligand,
		and a directory to save intermediate files.
		Takes as input filenames pdb of the protein, pdb of the ligand.

		This function then computes the centroid of the ligand; decrements this
		centroid from the atomic coordinates of protein and ligand atoms, and then
		@@ -1314,6 +1225,12 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
		saved.

		This function then computes a featurization with scheme specified by the user.
		Parameters
		----------
		mol_pdb_file: Str
		Filename for ligand pdb file.
		protein_pdb_file: Str
		Filename for protein pdb file.
		"""
		try:
		############################################################## TIMING
		@@ -1321,7 +1238,7 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
		############################################################## TIMING

		protein_xyz, protein_rdk = load_molecule(
		protein_pdb, calc_charges=True, sanitize=self.sanitize)
		protein_pdb_file, calc_charges=True, sanitize=self.sanitize)
		############################################################## TIMING
		time2 = time.time()
		log("TIMING: Loading protein coordinates took %0.3f s" % (time2 - time1),
		@@ -1331,7 +1248,7 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
		time1 = time.time()
		############################################################## TIMING
		ligand_xyz, ligand_rdk = load_molecule(
		ligand_file, calc_charges=True, sanitize=self.sanitize)
		mol_pdb_file, calc_charges=True, sanitize=self.sanitize)
		############################################################## TIMING
		time2 = time.time()
		log("TIMING: Loading ligand coordinates took %0.3f s" % (time2 - time1),
		@@ -1362,7 +1279,7 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
		rotated_system = rotate_molecules([protein_xyz, ligand_xyz])
		transformed_systems[(i + 1, 0)] = rotated_system

		features = {}
		features_dict = {}
		for system_id, (protein_xyz, ligand_xyz) in transformed_systems.items():
		feature_arrays = []
		for is_flat, function_name in self.feature_types:
		@@ -1378,11 +1295,13 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
		feature_arrays += result

		if self.flatten:
		features[system_id] = np.concatenate(
		features_dict[system_id] = np.concatenate(
		[feature_array.flatten() for feature_array in feature_arrays])
		else:
		features[system_id] = np.concatenate(feature_arrays, axis=-1)
		features_dict[system_id] = np.concatenate(feature_arrays, axis=-1)

		# TODO(rbharath): Is this squeeze OK?
		features = np.squeeze(np.array(list(features_dict.values())))
		return features

		def _voxelize(self,
		@@ -1480,9 +1399,3 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
		feature_vector[0] += len(feature_list)

		return feature_vector


		def _featurize_complex(featurizer, ligand_ext, ligand_lines, protein_pdb_lines,
		log_message):
		return featurizer._featurize_complex(ligand_ext, ligand_lines,
		protein_pdb_lines, log_message)

deepchem/feat/tests/test_rdkit_grid_features.py

+39 −34

Original line number	Diff line number	Diff line
		@@ -33,19 +33,19 @@ class TestHelperFunctions(unittest.TestCase):
		'3ws9_protein_fixer_rdkit.pdb')
		self.ligand_file = os.path.join(current_dir, '3ws9_ligand.sdf')

		def test_get_ligand_filetype(self):
		#def test_get_ligand_filetype(self):

		supported_extensions = ['mol2', 'sdf', 'pdb', 'pdbqt']
		# some users might try to read smiles with this function
		unsupported_extensions = ['smi', 'ism']
		# supported_extensions = ['mol2', 'sdf', 'pdb', 'pdbqt']
		# # some users might try to read smiles with this function
		# unsupported_extensions = ['smi', 'ism']

		for extension in supported_extensions:
		fname = 'molecule.%s' % extension
		self.assertEqual(rgf.get_ligand_filetype(fname), extension)
		# for extension in supported_extensions:
		# fname = 'molecule.%s' % extension
		# self.assertEqual(rgf.get_ligand_filetype(fname), extension)

		for extension in unsupported_extensions:
		fname = 'molecule.%s' % extension
		self.assertRaises(ValueError, rgf.get_ligand_filetype, fname)
		# for extension in unsupported_extensions:
		# fname = 'molecule.%s' % extension
		# self.assertRaises(ValueError, rgf.get_ligand_filetype, fname)

		def test_load_molecule(self):
		# adding hydrogens and charges is tested in dc.utils
		@@ -367,7 +367,8 @@ class TestFeaturizationFunctions(unittest.TestCase):
		prot_xyz,
		prot_rdk,
		lig_xyz,
		lig_rdk,)
		lig_rdk,
		)
		prot_dict_dist, lig_dict_dist = rgf.featurize_binding_pocket_ecfp(
		prot_xyz, prot_rdk, lig_xyz, lig_rdk, pairwise_distances=distance)
		# ...but first check if we actually got two dicts
		@@ -383,13 +384,15 @@ class TestFeaturizationFunctions(unittest.TestCase):
		prot_rdk,
		lig_xyz,
		lig_rdk,
		cutoff=2.0,)
		cutoff=2.0,
		)
		prot_dict_d6, lig_dict_d6 = rgf.featurize_binding_pocket_ecfp(
		prot_xyz,
		prot_rdk,
		lig_xyz,
		lig_rdk,
		cutoff=6.0,)
		cutoff=6.0,
		)
		self.assertLess(len(prot_dict_d2), len(prot_dict))
		# ligands are typically small so all atoms might be present
		self.assertLessEqual(len(lig_dict_d2), len(lig_dict))
		@@ -402,7 +405,8 @@ class TestFeaturizationFunctions(unittest.TestCase):
		prot_rdk,
		lig_xyz,
		lig_rdk,
		ecfp_degree=3,)
		ecfp_degree=3,
		)
		self.assertNotEqual(prot_dict_e3, prot_dict)
		self.assertNotEqual(lig_dict_e3, lig_dict)

		@@ -419,7 +423,8 @@ class TestFeaturizationFunctions(unittest.TestCase):
		prot_rdk,
		lig_rdk,
		distance,
		bins,)
		bins,
		)

		self.assertIsInstance(splif_dict, dict)
		for (prot_idx, lig_idx), ecfp_pair in splif_dict.items():
		@@ -478,7 +483,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
		# test if default parameters work
		featurizer = rgf.RdkitGridFeaturizer()
		self.assertIsInstance(featurizer, rgf.RdkitGridFeaturizer)
		feature_tensor = featurizer.featurize_complexes([self.ligand_file],
		feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
		[self.protein_file])
		self.assertIsInstance(feature_tensor, np.ndarray)

		@@ -490,7 +495,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
		ecfp_power=9,
		splif_power=9,
		flatten=True)
		feature_tensor = featurizer.featurize_complexes([self.ligand_file],
		feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
		[self.protein_file])
		self.assertIsInstance(feature_tensor, np.ndarray)

		@@ -499,7 +504,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
		featurizer = rgf.RdkitGridFeaturizer(
		feature_types=['ecfp_hashed'], flatten=False)
		featurizer.flatten = True # False should be ignored with ecfp_hashed
		feature_tensor = featurizer.featurize_complexes([self.ligand_file],
		feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
		[self.protein_file])
		self.assertIsInstance(feature_tensor, np.ndarray)
		self.assertEqual(feature_tensor.shape, (1, 2 * 2**featurizer.ecfp_power))
		@@ -516,13 +521,13 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
		splif_power=splif_power,
		flatten=False,
		sanitize=True)
		feature_tensor = featurizer.featurize_complexes([self.ligand_file],
		feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
		[self.protein_file])
		self.assertIsInstance(feature_tensor, np.ndarray)
		voxel_total_len = (
		2**ecfp_power +
		len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power +
		len(featurizer.cutoffs['hbond_dist_bins']) + 5)
		len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power + len(
		featurizer.cutoffs['hbond_dist_bins']) + 5)
		self.assertEqual(feature_tensor.shape, (1, 20, 20, 20, voxel_total_len))

		# test flat features
		@@ -532,13 +537,13 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
		ecfp_power=ecfp_power,
		splif_power=splif_power,
		sanitize=True)
		feature_tensor = featurizer.featurize_complexes([self.ligand_file],
		feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
		[self.protein_file])
		self.assertIsInstance(feature_tensor, np.ndarray)
		flat_total_len = (
		3 * 2**ecfp_power +
		len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power +
		len(featurizer.cutoffs['hbond_dist_bins']))
		len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power + len(
		featurizer.cutoffs['hbond_dist_bins']))
		self.assertEqual(feature_tensor.shape, (1, flat_total_len))

		# check if aromatic features are ignores if sanitize=False
		@@ -552,7 +557,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):

		self.assertTrue('pi_stack' not in featurizer.feature_types)
		self.assertTrue('cation_pi' not in featurizer.feature_types)
		feature_tensor = featurizer.featurize_complexes([self.ligand_file],
		feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
		[self.protein_file])
		self.assertIsInstance(feature_tensor, np.ndarray)
		total_len = voxel_total_len + flat_total_len - 3 - 2**ecfp_power
		@@ -580,9 +585,9 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
		feature_types=['voxel_combined'],
		flatten=False,
		sanitize=True)
		feature_tensors = featurizer.featurize_complexes([self.ligand_file],
		feature_tensors, _ = featurizer.featurize_complexes([self.ligand_file],
		[self.protein_file])
		self.assertEqual(len(feature_tensors), 4)
		self.assertEqual(feature_tensors.shape, (1, 4, 16, 16, 16, 40))

		def test_voxelize(self):
		prot_xyz, prot_rdk = rgf.load_molecule(self.protein_file)

deepchem/molnet/load_function/pdbbind_datasets.py

+5 −2

Original line number	Diff line number	Diff line
		@@ -234,13 +234,16 @@ def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
		else:
		raise ValueError("Featurizer not supported")
		print("Featurizing Complexes")
		features, failures = featurizer.featurize_complexes(
		ligand_files, protein_files, log_every_n=1)
		features, failures = featurizer.featurize_complexes(ligand_files,
		protein_files)
		# Delete labels for failing elements
		labels = np.delete(labels, failures)
		dataset = deepchem.data.DiskDataset.from_numpy(features, labels)
		# No transformations of data
		transformers = []
		if split == None:
		return tasks, (dataset, None, None), transformers

		# TODO(rbharath): This should be modified to contain a cluster split so
		# structures of the same protein aren't in both train/test
		splitters = {

Admin message