Merge pull request #1397 from rbharath/atomic_molnet (b57a23c2) · Commits · 钟慕尧 / deepchem

deepchem/dock/pose_scoring.py

+2 −2

Original line number	Diff line number	Diff line
		@@ -46,7 +46,7 @@ class GridPoseScorer(object):

		def score(self, protein_file, ligand_file):
		"""Returns a score for a protein/ligand pair."""
		features = self.featurizer.featurize_complexes([ligand_file],
		features, _ = self.featurizer.featurize_complexes([ligand_file],
		[protein_file])
		dataset = NumpyDataset(X=features, y=None, w=None, ids=None)
		score = self.model.predict(dataset)

deepchem/feat/atomic_coordinates.py

+30 −12

Original line number	Diff line number	Diff line
		@@ -8,10 +8,13 @@ __author__ = "Joseph Gomes and Bharath Ramsundar"
		__copyright__ = "Copyright 2016, Stanford University"
		__license__ = "MIT"

		import logging
		import numpy as np
		from deepchem.utils.save import log
		from deepchem.feat import Featurizer
		from deepchem.feat import ComplexFeaturizer
		from deepchem.utils import rdkit_util, pad_array
		from deepchem.utils.rdkit_util import MoleculeLoadException


		class AtomicCoordinates(Featurizer):
		@@ -172,10 +175,10 @@ class NeighborListComplexAtomicCoordinates(ComplexFeaturizer):

		Parameters
		----------
		mol_pdb: list
		Should be a list of lines of the PDB file.
		complex_pdb: list
		Should be a list of lines of the PDB file.
		mol_pdb_file: Str
		Filename for ligand pdb file.
		protein_pdb_file: Str
		Filename for protein pdb file.
		"""
		mol_coords, ob_mol = rdkit_util.load_molecule(mol_pdb_file)
		protein_coords, protein_mol = rdkit_util.load_molecule(protein_pdb_file)
		@@ -223,8 +226,14 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
		self.max_num_neighbors, self.neighbor_cutoff)

		def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
		try:
		frag1_coords, frag1_mol = rdkit_util.load_molecule(mol_pdb_file)
		frag2_coords, frag2_mol = rdkit_util.load_molecule(protein_pdb_file)
		except MoleculeLoadException:
		# Currently handles loading failures by returning None
		# TODO: Is there a better handling procedure?
		logging.warning("Some molecules cannot be loaded by Rdkit. Skipping")
		return None
		system_mol = rdkit_util.merge_molecules(frag1_mol, frag2_mol)
		system_coords = rdkit_util.get_xyz_from_mol(system_mol)

		@@ -232,6 +241,7 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
		frag2_coords, frag2_mol = self._strip_hydrogens(frag2_coords, frag2_mol)
		system_coords, system_mol = self._strip_hydrogens(system_coords, system_mol)

		try:
		frag1_coords, frag1_neighbor_list, frag1_z = self.featurize_mol(
		frag1_coords, frag1_mol, self.frag1_num_atoms)

		@@ -240,15 +250,23 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):

		system_coords, system_neighbor_list, system_z = self.featurize_mol(
		system_coords, system_mol, self.complex_num_atoms)
		except ValueError as e:
		logging.warning(
		"max_atoms was set too low. Some complexes too large and skipped")
		return None

		return frag1_coords, frag1_neighbor_list, frag1_z, frag2_coords, frag2_neighbor_list, frag2_z, \
		system_coords, system_neighbor_list, system_z

		def get_Z_matrix(self, mol, max_atoms):
		if len(mol.GetAtoms()) > max_atoms:
		raise ValueError("A molecule is larger than permitted by max_atoms. "
		"Increase max_atoms and try again.")
		return pad_array(
		np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()]), max_atoms)

		def featurize_mol(self, coords, mol, max_num_atoms):
		logging.info("Featurizing molecule of size: %d", len(mol.GetAtoms()))
		neighbor_list = compute_neighbor_list(coords, self.neighbor_cutoff,
		self.max_num_neighbors, None)
		z = self.get_Z_matrix(mol, max_num_atoms)

deepchem/feat/base_classes.py

+37 −13

Original line number	Diff line number	Diff line
		"""
		Feature calculations.
		"""
		import logging
		import types
		import numpy as np
		from rdkit import Chem
		from rdkit.Chem import rdGeometry, rdMolTransforms
		from deepchem.utils.save import log
		import multiprocessing

		__author__ = "Steven Kearnes"
		__copyright__ = "Copyright 2014, Stanford University"
		__license__ = "BSD 3-clause"


		def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
		logging.info(log_message)
		return featurizer._featurize_complex(mol_pdb_file, protein_pdb_file)


		class ComplexFeaturizer(object):
		""""
		Abstract class for calculating features for mol/protein complexes.
		"""

		def featurize_complexes(self, mol_pdbs, protein_pdbs, verbose=True,
		log_every_n=1000):
		def featurize_complexes(self, mol_files, protein_pdbs):
		"""
		Calculate features for mol/protein complexes.

		Parameters
		----------
		mol_pdbs: list
		List of PDBs for molecules. Each PDB should be a list of lines of the
		PDB file.
		mols: list
		List of PDB filenames for molecules.
		protein_pdbs: list
		List of PDBs for proteins. Each PDB should be a list of lines of the
		PDB file.
		List of PDB filenames for proteins.

		Returns
		-------
		features: np.array
		Array of features
		failures: list
		Indices of complexes that failed to featurize.
		"""
		pool = multiprocessing.Pool()
		results = []
		for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
		log_message = "Featurizing %d / %d" % (i, len(mol_files))
		results.append(
		pool.apply_async(_featurize_complex,
		(self, mol_file, protein_pdb, log_message)))
		pool.close()
		features = []
		for i, (mol_pdb, protein_pdb) in enumerate(zip(mol_pdbs, protein_pdbs)):
		if verbose and i % log_every_n == 0:
		log("Featurizing %d / %d" % (i, len(mol_pdbs)))
		features.append(self._featurize_complex(mol_pdb, protein_pdb))
		failures = []
		for ind, result in enumerate(results):
		new_features = result.get()
		# Handle loading failures which return None
		if new_features is not None:
		features.append(new_features)
		else:
		failures.append(ind)
		features = np.asarray(features)
		return features
		return features, failures

		def _featurize_complex(self, mol_pdb, complex_pdb):
		"""
		@@ -51,6 +74,7 @@ class ComplexFeaturizer(object):
		"""
		raise NotImplementedError('Featurizer is not defined.')


		class Featurizer(object):
		"""
		Abstract class for calculating a set of features for a molecule.

deepchem/feat/rdkit_grid_featurizer.py

+41 −112

Original line number	Diff line number	Diff line
		@@ -5,6 +5,7 @@ __author__ = "Bharath Ramsundar, Evan Feinberg, and Karl Leswing"
		__copyright__ = "Copyright 2016, Stanford University"
		__license__ = "MIT"

		import logging
		import os
		import shutil
		from warnings import warn
		@@ -16,6 +17,7 @@ from collections import Counter
		from rdkit import Chem
		from rdkit.Chem import AllChem
		from deepchem.utils.rdkit_util import load_molecule
		from deepchem.utils.rdkit_util import MoleculeLoadException

		import numpy as np
		from scipy.spatial.distance import cdist
		@@ -27,20 +29,6 @@ TODO(LESWING) add sanitization with rdkit upgrade to 2017.*
		"""


		def get_ligand_filetype(ligand_filename):
		"""Returns the filetype of ligand."""
		if ".mol2" in ligand_filename:
		return "mol2"
		elif ".sdf" in ligand_filename:
		return "sdf"
		elif ".pdbqt" in ligand_filename:
		return "pdbqt"
		elif ".pdb" in ligand_filename:
		return "pdb"
		else:
		raise ValueError("Unrecognized_filename")


		def compute_centroid(coordinates):
		"""Compute the x,y,z centroid of provided coordinates

		@@ -1226,75 +1214,10 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
		]
		raise ValueError('Unknown feature type "%s"' % feature_name)

		def _featurize_complex(self, ligand_ext, ligand_lines, protein_pdb_lines,
		log_message):
		tempdir = tempfile.mkdtemp()
		if log_message is not None:
		log(log_message)

		############################################################## TIMING
		time1 = time.time()
		############################################################## TIMING
		ligand_file = os.path.join(tempdir, "ligand.%s" % ligand_ext)
		with open(ligand_file, "w") as mol_f:
		mol_f.writelines(ligand_lines)
		############################################################## TIMING
		time2 = time.time()
		log("TIMING: Writing ligand took %0.3f s" % (time2 - time1), self.verbose)
		############################################################## TIMING

		############################################################## TIMING
		time1 = time.time()
		############################################################## TIMING
		protein_pdb_file = os.path.join(tempdir, "protein.pdb")
		with open(protein_pdb_file, "w") as protein_f:
		protein_f.writelines(protein_pdb_lines)
		############################################################## TIMING
		time2 = time.time()
		log("TIMING: Writing protein took %0.3f s" % (time2 - time1), self.verbose)
		############################################################## TIMING

		features_dict = self._transform(protein_pdb_file, ligand_file)
		shutil.rmtree(tempdir)
		return list(features_dict.values())

		def featurize_complexes(self, mol_files, protein_pdbs, log_every_n=1000):
		"""
		Calculate features for mol/protein complexes.

		Parameters
		----------
		mols: list
		List of PDB filenames for molecules.
		protein_pdbs: list
		List of PDB filenames for proteins.
		"""
		pool = multiprocessing.Pool()
		results = []
		for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
		log_message = "Featurizing %d / %d" % (
		i, len(mol_files)) if i % log_every_n == 0 else None
		ligand_ext = get_ligand_filetype(mol_file)
		with open(mol_file) as mol_f:
		mol_lines = mol_f.readlines()
		with open(protein_pdb) as protein_file:
		protein_pdb_lines = protein_file.readlines()
		results.append(
		pool.apply_async(
		_featurize_complex,
		(self, ligand_ext, mol_lines, protein_pdb_lines, log_message)))
		pool.close()
		features = []
		for result in results:
		features += result.get()
		features = np.asarray(features)
		return features

		def _transform(self, protein_pdb, ligand_file):
		"""Computes featurization of protein/ligand complex.
		def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
		"""Computes grid featurization of protein/ligand complex.

		Takes as input files (strings) for pdb of the protein, pdb of the ligand,
		and a directory to save intermediate files.
		Takes as input filenames pdb of the protein, pdb of the ligand.

		This function then computes the centroid of the ligand; decrements this
		centroid from the atomic coordinates of protein and ligand atoms, and then
		@@ -1302,13 +1225,20 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
		saved.

		This function then computes a featurization with scheme specified by the user.
		Parameters
		----------
		mol_pdb_file: Str
		Filename for ligand pdb file.
		protein_pdb_file: Str
		Filename for protein pdb file.
		"""
		try:
		############################################################## TIMING
		time1 = time.time()
		############################################################## TIMING

		protein_xyz, protein_rdk = load_molecule(
		protein_pdb, calc_charges=True, sanitize=self.sanitize)
		protein_pdb_file, calc_charges=True, sanitize=self.sanitize)
		############################################################## TIMING
		time2 = time.time()
		log("TIMING: Loading protein coordinates took %0.3f s" % (time2 - time1),
		@@ -1318,12 +1248,15 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
		time1 = time.time()
		############################################################## TIMING
		ligand_xyz, ligand_rdk = load_molecule(
		ligand_file, calc_charges=True, sanitize=self.sanitize)
		mol_pdb_file, calc_charges=True, sanitize=self.sanitize)
		############################################################## TIMING
		time2 = time.time()
		log("TIMING: Loading ligand coordinates took %0.3f s" % (time2 - time1),
		self.verbose)
		############################################################## TIMING
		except MoleculeLoadException:
		logging.warning("Some molecules cannot be loaded by Rdkit. Skipping")
		return None

		############################################################## TIMING
		time1 = time.time()
		@@ -1346,7 +1279,7 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
		rotated_system = rotate_molecules([protein_xyz, ligand_xyz])
		transformed_systems[(i + 1, 0)] = rotated_system

		features = {}
		features_dict = {}
		for system_id, (protein_xyz, ligand_xyz) in transformed_systems.items():
		feature_arrays = []
		for is_flat, function_name in self.feature_types:
		@@ -1362,11 +1295,13 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
		feature_arrays += result

		if self.flatten:
		features[system_id] = np.concatenate(
		features_dict[system_id] = np.concatenate(
		[feature_array.flatten() for feature_array in feature_arrays])
		else:
		features[system_id] = np.concatenate(feature_arrays, axis=-1)
		features_dict[system_id] = np.concatenate(feature_arrays, axis=-1)

		# TODO(rbharath): Is this squeeze OK?
		features = np.squeeze(np.array(list(features_dict.values())))
		return features

		def _voxelize(self,
		@@ -1464,9 +1399,3 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
		feature_vector[0] += len(feature_list)

		return feature_vector


		def _featurize_complex(featurizer, ligand_ext, ligand_lines, protein_pdb_lines,
		log_message):
		return featurizer._featurize_complex(ligand_ext, ligand_lines,
		protein_pdb_lines, log_message)

deepchem/feat/tests/test_rdkit_grid_features.py

+29 −38

Original line number	Diff line number	Diff line
		@@ -33,20 +33,6 @@ class TestHelperFunctions(unittest.TestCase):
		'3ws9_protein_fixer_rdkit.pdb')
		self.ligand_file = os.path.join(current_dir, '3ws9_ligand.sdf')

		def test_get_ligand_filetype(self):

		supported_extensions = ['mol2', 'sdf', 'pdb', 'pdbqt']
		# some users might try to read smiles with this function
		unsupported_extensions = ['smi', 'ism']

		for extension in supported_extensions:
		fname = 'molecule.%s' % extension
		self.assertEqual(rgf.get_ligand_filetype(fname), extension)

		for extension in unsupported_extensions:
		fname = 'molecule.%s' % extension
		self.assertRaises(ValueError, rgf.get_ligand_filetype, fname)

		def test_load_molecule(self):
		# adding hydrogens and charges is tested in dc.utils
		for add_hydrogens in (True, False):
		@@ -367,7 +353,8 @@ class TestFeaturizationFunctions(unittest.TestCase):
		prot_xyz,
		prot_rdk,
		lig_xyz,
		lig_rdk,)
		lig_rdk,
		)
		prot_dict_dist, lig_dict_dist = rgf.featurize_binding_pocket_ecfp(
		prot_xyz, prot_rdk, lig_xyz, lig_rdk, pairwise_distances=distance)
		# ...but first check if we actually got two dicts
		@@ -383,13 +370,15 @@ class TestFeaturizationFunctions(unittest.TestCase):
		prot_rdk,
		lig_xyz,
		lig_rdk,
		cutoff=2.0,)
		cutoff=2.0,
		)
		prot_dict_d6, lig_dict_d6 = rgf.featurize_binding_pocket_ecfp(
		prot_xyz,
		prot_rdk,
		lig_xyz,
		lig_rdk,
		cutoff=6.0,)
		cutoff=6.0,
		)
		self.assertLess(len(prot_dict_d2), len(prot_dict))
		# ligands are typically small so all atoms might be present
		self.assertLessEqual(len(lig_dict_d2), len(lig_dict))
		@@ -402,7 +391,8 @@ class TestFeaturizationFunctions(unittest.TestCase):
		prot_rdk,
		lig_xyz,
		lig_rdk,
		ecfp_degree=3,)
		ecfp_degree=3,
		)
		self.assertNotEqual(prot_dict_e3, prot_dict)
		self.assertNotEqual(lig_dict_e3, lig_dict)

		@@ -419,7 +409,8 @@ class TestFeaturizationFunctions(unittest.TestCase):
		prot_rdk,
		lig_rdk,
		distance,
		bins,)
		bins,
		)

		self.assertIsInstance(splif_dict, dict)
		for (prot_idx, lig_idx), ecfp_pair in splif_dict.items():
		@@ -478,7 +469,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
		# test if default parameters work
		featurizer = rgf.RdkitGridFeaturizer()
		self.assertIsInstance(featurizer, rgf.RdkitGridFeaturizer)
		feature_tensor = featurizer.featurize_complexes([self.ligand_file],
		feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
		[self.protein_file])
		self.assertIsInstance(feature_tensor, np.ndarray)

		@@ -490,7 +481,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
		ecfp_power=9,
		splif_power=9,
		flatten=True)
		feature_tensor = featurizer.featurize_complexes([self.ligand_file],
		feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
		[self.protein_file])
		self.assertIsInstance(feature_tensor, np.ndarray)

		@@ -499,7 +490,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
		featurizer = rgf.RdkitGridFeaturizer(
		feature_types=['ecfp_hashed'], flatten=False)
		featurizer.flatten = True # False should be ignored with ecfp_hashed
		feature_tensor = featurizer.featurize_complexes([self.ligand_file],
		feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
		[self.protein_file])
		self.assertIsInstance(feature_tensor, np.ndarray)
		self.assertEqual(feature_tensor.shape, (1, 2 * 2**featurizer.ecfp_power))
		@@ -516,13 +507,13 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
		splif_power=splif_power,
		flatten=False,
		sanitize=True)
		feature_tensor = featurizer.featurize_complexes([self.ligand_file],
		feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
		[self.protein_file])
		self.assertIsInstance(feature_tensor, np.ndarray)
		voxel_total_len = (
		2**ecfp_power +
		len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power +
		len(featurizer.cutoffs['hbond_dist_bins']) + 5)
		len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power + len(
		featurizer.cutoffs['hbond_dist_bins']) + 5)
		self.assertEqual(feature_tensor.shape, (1, 20, 20, 20, voxel_total_len))

		# test flat features
		@@ -532,13 +523,13 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
		ecfp_power=ecfp_power,
		splif_power=splif_power,
		sanitize=True)
		feature_tensor = featurizer.featurize_complexes([self.ligand_file],
		feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
		[self.protein_file])
		self.assertIsInstance(feature_tensor, np.ndarray)
		flat_total_len = (
		3 * 2**ecfp_power +
		len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power +
		len(featurizer.cutoffs['hbond_dist_bins']))
		len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power + len(
		featurizer.cutoffs['hbond_dist_bins']))
		self.assertEqual(feature_tensor.shape, (1, flat_total_len))

		# check if aromatic features are ignores if sanitize=False
		@@ -552,7 +543,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):

		self.assertTrue('pi_stack' not in featurizer.feature_types)
		self.assertTrue('cation_pi' not in featurizer.feature_types)
		feature_tensor = featurizer.featurize_complexes([self.ligand_file],
		feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
		[self.protein_file])
		self.assertIsInstance(feature_tensor, np.ndarray)
		total_len = voxel_total_len + flat_total_len - 3 - 2**ecfp_power
		@@ -580,9 +571,9 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
		feature_types=['voxel_combined'],
		flatten=False,
		sanitize=True)
		feature_tensors = featurizer.featurize_complexes([self.ligand_file],
		feature_tensors, _ = featurizer.featurize_complexes([self.ligand_file],
		[self.protein_file])
		self.assertEqual(len(feature_tensors), 4)
		self.assertEqual(feature_tensors.shape, (1, 4, 16, 16, 16, 40))

		def test_voxelize(self):
		prot_xyz, prot_rdk = rgf.load_molecule(self.protein_file)

Admin message