changes (d54147be) · Commits · 钟慕尧 / deepchem

deepchem/utils/rdkit_util.py

+357 −255

Original line number	Diff line number	Diff line
		import logging
		"""
		RDKit Utilities.

		import numpy as np
		import os
		This file contains utilities that compute useful properties of
		molecules. Some of these are simple cleanup utilities, and
		others are more sophisticated functions that detect chemical
		properties of molecules.
		"""

		try:
		from StringIO import StringIO
		except ImportError:
		import os
		import logging
		import itertools
		import numpy as np
		from io import StringIO
		from copy import deepcopy
		from collections import Counter
		from deepchem.utils import pdbqt_utils
		from deepchem.utils.pdbqt_utils import convert_mol_to_pdbqt
		from deepchem.utils.pdbqt_utils import convert_protein_to_pdbqt
		from deepchem.utils.geometry_utils import angle_between
		from deepchem.utils.geometry_utils import is_angle_within_cutoff
		from deepchem.utils.geometry_utils import generate_random_rotation_matrix

		logger = logging.getLogger(__name__)

		class MoleculeLoadException(Exception):

		@@ -16,9 +30,18 @@ class MoleculeLoadException(Exception):


		def get_xyz_from_mol(mol):
		"""
		returns an m x 3 np array of 3d coords
		of given rdkit molecule
		"""Extracts a numpy array of coordinates from a molecule.

		Returns a `(N, 3)` numpy array of 3d coords of given rdkit molecule

		Parameters
		----------
		mol: rdkit Molecule
		Molecule to extract coordinates for

		Returns
		-------
		Numpy ndarray of shape `(N, 3)` where `N = mol.GetNumAtoms()`.
		"""
		xyz = np.zeros((mol.GetNumAtoms(), 3))
		conf = mol.GetConformer()
		@@ -29,13 +52,55 @@ def get_xyz_from_mol(mol):
		xyz[i, 2] = position.z
		return (xyz)


		def add_hydrogens_to_mol(mol):
		"""
		Add hydrogens to a molecule object
		TODO (LESWING) see if there are more flags to add here for default
		:param mol: Rdkit Mol
		:return: Rdkit Mol

		Parameters
		----------
		mol: Rdkit Mol
		Molecule to hydrogenate

		Returns
		-------
		Rdkit Mol

		Note
		----
		This function requires RDKit and PDBFixer to be installed.
		"""
		return apply_pdbfixer(mol, hydrogenate=True)


		def apply_pdbfixer(mol, add_missing=True, hydrogenate=True, pH=7.4,
		remove_heterogens=True, is_protein=True):
		"""
		Apply PDBFixer to a molecule to try to clean it up.

		Parameters
		----------
		mol: Rdkit Mol
		Molecule to hydrogenate
		add_missing: bool, optional
		If true, add in missing residues and atoms
		hydrogenate: bool, optional
		If true, add hydrogens at specified pH
		pH: float, optional
		The pH at which hydrogens will be added if `hydrogenate==True`. Set to 7.4 by default.
		remove_heterogens: bool, optional
		Often times, PDB files come with extra waters and salts attached.
		If this field is set, remove these heterogens.
		is_protein: bool, optional
		If false, then don't remove heterogens (since this molecule is
		itself a heterogen).

		Returns
		-------
		Rdkit Mol

		Note
		----
		This function requires RDKit and PDBFixer to be installed.
		"""
		molecule_file = None
		try:
		@@ -46,10 +111,15 @@ def add_hydrogens_to_mol(mol):
		pdb_stringio.seek(0)
		import pdbfixer
		fixer = pdbfixer.PDBFixer(pdbfile=pdb_stringio)
		if add_missing:
		fixer.findMissingResidues()
		fixer.findMissingAtoms()
		fixer.addMissingAtoms()
		fixer.addMissingHydrogens(7.4)
		if hydrogenate:
		fixer.addMissingHydrogens(pH)
		if is_protein and remove_heterogens:
		# False here specifies that water is to be removed
		fixer.removeHeterogens(False)

		hydrogenated_io = StringIO()
		import simtk
		@@ -59,7 +129,7 @@ def add_hydrogens_to_mol(mol):
		return Chem.MolFromPDBBlock(
		hydrogenated_io.read(), sanitize=False, removeHs=False)
		except ValueError as e:
		logging.warning("Unable to add hydrogens %s", e)
		logger.warning("Unable to add hydrogens %s", e)
		raise MoleculeLoadException(e)
		finally:
		try:
		@@ -69,10 +139,10 @@ def add_hydrogens_to_mol(mol):


		def compute_charges(mol):
		"""
		Attempt to compute Gasteiger Charges on Mol
		This also has the side effect of calculating charges on mol.
		The mol passed into this function has to already have been sanitized
		"""Attempt to compute Gasteiger Charges on Mol

		This also has the side effect of calculating charges on mol. The
		mol passed into this function has to already have been sanitized

		Params
		------
		@@ -80,44 +150,125 @@ def compute_charges(mol):

		Returns
		-------
		molecule with charges
		No return since updates in place.

		Note
		----
		This function requires RDKit to be installed.
		"""
		from rdkit.Chem import AllChem
		try:
		# Updates charges in place
		AllChem.ComputeGasteigerCharges(mol)
		except Exception as e:
		logging.exception("Unable to compute charges for mol")
		raise MoleculeLoadException(e)
		return mol

		def load_complex(molecular_complex,
		                 add_hydrogens=True,
		                 calc_charges=True,
		                 pdbfix=True,
		                 sanitize=True):
		    """Loads a molecular complex.

		    Every entry of `molecular_complex` is loaded with `load_molecule` and
		    the results are collected into one flat list of fragments.

		    Parameters
		    ----------
		    molecular_complex: list or str
		      If list, each entry should be a filename for a constituent
		      molecule in complex. If str, should be the filename of a file that
		      holds the full complex.
		    add_hydrogens: bool, optional
		      Forwarded to `load_molecule`. If true, add hydrogens via pdbfixer.
		    calc_charges: bool, optional
		      Forwarded to `load_molecule`. If true, add charges via rdkit.
		    pdbfix: bool, optional
		      Forwarded to `load_molecule`. If true, apply pdbfixer to clean up
		      the molecule.
		    sanitize: bool, optional
		      Forwarded to `load_molecule`. If true, sanitize molecules via rdkit.

		    Returns
		    -------
		    List of whatever `load_molecule` returns for each entry (documented
		    there as `(xyz, mol)` tuples); list results are flattened into the
		    output.

		    Note
		    ----
		    This function requires RDKit to be installed.
		    """
		    # A bare filename is a one-element complex. It must be wrapped under the
		    # same name that the loop below iterates over; otherwise the string
		    # itself would be iterated character by character.
		    if isinstance(molecular_complex, str):
		        molecular_complex = [molecular_complex]

		    fragments = []
		    for mol in molecular_complex:
		        loaded = load_molecule(
		            mol,
		            add_hydrogens=add_hydrogens,
		            calc_charges=calc_charges,
		            pdbfix=pdbfix,
		            sanitize=sanitize)
		        # load_molecule may hand back either a single result or a list of
		        # results; flatten the latter.
		        if isinstance(loaded, list):
		            fragments.extend(loaded)
		        else:
		            fragments.append(loaded)
		    return fragments



		def load_molecule(molecule_file,
		add_hydrogens=True,
		calc_charges=True,
		sanitize=False):
		"""
		Converts molecule file to (xyz-coords, obmol object)
		sanitize=True,
		pdbfix=True):
		"""Converts molecule file to (xyz-coords, obmol object)

		Given molecule_file, returns a tuple of xyz coords of molecule
		and an rdkit object representing that molecule
		:param molecule_file: filename for molecule
		:param add_hydrogens: should add hydrogens via pdbfixer?
		:param calc_charges: should add charges vis rdkit
		:return: (xyz, mol)
		and an rdkit object representing that molecule in that order `(xyz,
		rdkit_mol)`. This ordering convention is used in the code in a few
		places.

		Parameters
		----------
		molecule_file: str
		filename for molecule
		add_hydrogens: bool, optional
		If true, add hydrogens via pdbfixer
		calc_charges: bool, optional
		If true, add charges via rdkit
		sanitize: bool, optional
		If true, sanitize molecules via rdkit
		pdbfix: bool, optional
		If true, apply pdbfixer to clean up this molecule.

		Returns
		-------
		Tuple (xyz, mol) if file contains single molecule. Else returns a
		list of the tuples for the separate molecules in this list.

		Note
		----
		This function requires RDKit to be installed.
		"""
		from rdkit import Chem
		from rdkit.Chem.rdchem import AtomValenceException
		from_pdb = False
		if ".mol2" in molecule_file:
		my_mol = Chem.MolFromMol2File(molecule_file, sanitize=False, removeHs=False)
		elif ".sdf" in molecule_file:
		suppl = Chem.SDMolSupplier(str(molecule_file), sanitize=False)
		# TODO: This is wrong. Should return all molecules
		my_mol = suppl[0]
		elif ".pdbqt" in molecule_file:
		pdb_block = pdbqt_to_pdb(molecule_file)
		pdb_block = pdbqt_utils.pdbqt_to_pdb(molecule_file)
		my_mol = Chem.MolFromPDBBlock(
		str(pdb_block), sanitize=False, removeHs=False)
		from_pdb = True
		elif ".pdb" in molecule_file:
		my_mol = Chem.MolFromPDBFile(
		str(molecule_file), sanitize=False, removeHs=False)
		from_pdb = True
		else:
		raise ValueError("Unrecognized file type")

		@@ -125,12 +276,15 @@ def load_molecule(molecule_file,
		raise ValueError("Unable to read non None Molecule Object")

		if add_hydrogens or calc_charges:
		my_mol = add_hydrogens_to_mol(my_mol)
		# TODO: mol should be always sanitized when charges are calculated
		# can't change it now because it would break a lot of examples
		# We assume if it's from a PDB, it should be a protein
		my_mol = apply_pdbfixer(my_mol, hydrogenate=add_hydrogens, is_protein=from_pdb)
		if sanitize:
		try:
		Chem.SanitizeMol(my_mol)
		except AtomValenceException:
		logger.warn("Mol %s failed valence check" % Chem.MolToSmiles(my_mol))
		if calc_charges:
		# This updates in place
		compute_charges(my_mol)

		xyz = get_xyz_from_mol(my_mol)
		@@ -138,45 +292,28 @@ def load_molecule(molecule_file,
		return xyz, my_mol


		def pdbqt_file_hack_protein(mol, outfile):
		    """Hack to convert a pdb protein into a pdbqt protein.

		    Rewrites `outfile` in place. Lines mentioning ROOT, ENDROOT or TORSDOF
		    are kept as they are, ATOM records are truncated to 66 characters and
		    extended with a "+0.000" charge column and the element symbol, and all
		    other lines are dropped.

		    Parameters
		    ----------
		    mol: rdkit Mol
		      Protein whose atoms are looked up by the (1-based) serial number
		      found in each ATOM record.
		    outfile: str
		      Filename which already has a valid pdb representation of mol.
		    """
		    # Read everything up front and close the handle before the same path is
		    # reopened for writing.
		    with open(outfile) as fin:
		        lines = [raw.strip() for raw in fin]

		    out_lines = []
		    for line in lines:
		        if "ROOT" in line or "ENDROOT" in line or "TORSDOF" in line:
		            out_lines.append("%s\n" % line)
		            continue
		        if not line.startswith("ATOM"):
		            continue
		        line = line[:66]
		        # PDB serial numbers are 1-based, rdkit atom indices are 0-based.
		        atom_index = int(line.split()[1])
		        atom = mol.GetAtoms()[atom_index - 1]
		        out_lines.append("%s +0.000 %s\n" % (line, atom.GetSymbol().ljust(2)))

		    with open(outfile, 'w') as fout:
		        fout.writelines(out_lines)
		def write_molecule(mol, outfile, is_protein=False):
		"""Write molecule to a file

		This function writes a representation of the provided molecule to
		the specified `outfile`. Doesn't return anything.

		def pdbqt_file_hack_ligand(mol, outfile):
		"""
		Hack to convert a pdb ligand into a pdbqt ligand
		:param mol: rdkit Mol Object
		:param outfile: filename which already has a valid pdb representation of mol
		"""
		PdbqtLigandWriter(mol, outfile).convert()
		Parameters
		----------
		mol: rdkit Mol
		Molecule to write
		outfile: str
		Filename to write mol to
		is_protein: bool, optional
		Is this molecule a protein?

		Note
		----
		This function requires RDKit to be installed.

		def write_molecule(mol, outfile, is_protein=False):
		"""
		Write molecule to a file
		:param mol: rdkit Mol object
		:param outfile: filename to write mol to
		:param is_protein: is this molecule a protein?
		Raises
		------
		ValueError: if `outfile` isn't of a supported format.
		"""
		from rdkit import Chem
		if ".pdbqt" in outfile:
		@@ -184,9 +321,9 @@ def write_molecule(mol, outfile, is_protein=False):
		writer.write(mol)
		writer.close()
		if is_protein:
		pdbqt_file_hack_protein(mol, outfile)
		convert_protein_to_pdbqt(mol, outfile)
		else:
		pdbqt_file_hack_ligand(mol, outfile)
		convert_mol_to_pdbqt(mol, outfile)
		elif ".pdb" in outfile:
		writer = Chem.PDBWriter(outfile)
		writer.write(mol)
		@@ -199,192 +336,157 @@ def write_molecule(mol, outfile, is_protein=False):
		raise ValueError("Unsupported Format")


		def pdbqt_to_pdb(filename):
		    """Builds a PDB block from a pdbqt file.

		    Every line of the file is cut to its first 66 characters and followed
		    by a newline. A line shorter than 66 characters keeps its own trailing
		    newline, so it is followed by an empty line in the result.

		    Parameters
		    ----------
		    filename: str
		      Path of the pdbqt file to read.

		    Returns
		    -------
		    str holding the truncated lines joined together.
		    """
		    # The context manager closes the handle; join avoids rebuilding the
		    # string once per line.
		    with open(filename) as fin:
		        return "".join("%s\n" % line[:66] for line in fin)


		def merge_molecules_xyz(protein_xyz, ligand_xyz):
		    """Merges coordinates of protein and ligand.

		    Parameters
		    ----------
		    protein_xyz: array-like
		      Coordinates of the protein, one row per atom.
		    ligand_xyz: array-like
		      Coordinates of the ligand, one row per atom.

		    Returns
		    -------
		    New numpy array with the protein rows followed by the ligand rows.
		    """
		    # vstack already returns a freshly allocated array, so no further
		    # stacking or copying is needed.
		    return np.vstack((protein_xyz, ligand_xyz))


		def merge_molecules(ligand, protein):
		    """Combines a ligand and a protein via rdkit's `CombineMols`.

		    Parameters
		    ----------
		    ligand: rdkit Mol
		      Passed as the first argument to `CombineMols`.
		    protein: rdkit Mol
		      Passed as the second argument to `CombineMols`.

		    Returns
		    -------
		    The result of `rdmolops.CombineMols(ligand, protein)`.
		    """
		    from rdkit.Chem.rdmolops import CombineMols
		    combined = CombineMols(ligand, protein)
		    return combined


		class PdbqtLigandWriter(object):
		"""
		Create a torsion tree and write to pdbqt file
		"""
		def merge_molecules_xyz(xyzs):
		"""Merges coordinates of multiple molecules.

		def __init__(self, mol, outfile):
		Parameters
		----------
		xyzs: List
		List of numpy arrays each of shape `(N_i, 3)` where `N_i` is the number of atoms in the i-th molecule.
		"""
		:param mol: The molecule whose value is stored in pdb format in outfile
		:param outfile: a valid pdb file with the extention .pdbqt
		"""
		self.mol = mol
		self.outfile = outfile
		return np.array(np.vstack(np.vstack(xyzs)))

		def convert(self):
		"""
		The single public function of this class.
		It converts a molecule and a pdb file into a pdbqt file stored in outfile
		"""
		import networkx as nx
		self._create_pdb_map()
		self._mol_to_graph()
		self._get_rotatable_bonds()

		for bond in self.rotatable_bonds:
		self.graph.remove_edge(bond[0], bond[1])
		self.components = [x for x in nx.connected_components(self.graph)]
		self._create_component_map(self.components)

		self.used_partitions = set()
		self.lines = []
		root = max(enumerate(self.components), key=lambda x: len(x[1]))[0]
		self.lines.append("ROOT\n")
		for atom in self.components[root]:
		self.lines.append(self._writer_line_for_atom(atom))
		self.lines.append("ENDROOT\n")
		self.used_partitions.add(root)
		for bond in self.rotatable_bonds:
		valid, next_partition = self._valid_bond(bond, root)
		if not valid:
		continue
		self._dfs(next_partition, bond)
		self.lines.append("TORSDOF %s" % len(self.rotatable_bonds))
		with open(self.outfile, 'w') as fout:
		for line in self.lines:
		fout.write(line)

		def _dfs(self, current_partition, bond):
		"""
		This function does a depth first search throught he torsion tree
		:param current_partition: The current partition to expand
		:param bond: the bond which goes from the previous partition into this partition
		"""
		if self._get_component_for_atom(bond[1]) != current_partition:
		bond = (bond[1], bond[0])
		self.used_partitions.add(self._get_component_for_atom(bond[0]))
		self.used_partitions.add(self._get_component_for_atom(bond[1]))
		self.lines.append("BRANCH %4s %4s\n" % (bond[0] + 1, bond[1] + 1))
		for atom in self.components[current_partition]:
		self.lines.append(self._writer_line_for_atom(atom))
		for b in self.rotatable_bonds:
		valid, next_partition = self._valid_bond(b, current_partition)
		if not valid:
		continue
		self._dfs(next_partition, b)
		self.lines.append("ENDBRANCH %4s %4s\n" % (bond[0] + 1, bond[1] + 1))
		def merge_molecules(molecules):
		"""Helper method to merge two molecules.

		def _get_component_for_atom(self, atom_number):
		"""
		:param atom_number: the atom number to check for component_id
		:return: the component_id that atom_number is part of
		"""
		return self.comp_map[atom_number]
		Parameters
		----------
		molecules: list
		List of rdkit molecules

		def _valid_bond(self, bond, current_partition):
		"""
		used to check if a bond goes from the current partition into a partition
		that is not yet explored
		:param bond: the bond to check if it goes to an unexplored partition
		:param current_partition: the current partition of the DFS
		:return: is_valid, next_partition
		Returns
		-------
		merged: rdkit molecule
		"""
		part1 = self.comp_map[bond[0]]
		part2 = self.comp_map[bond[1]]
		if part1 != current_partition and part2 != current_partition:
		return False, 0
		if part1 == current_partition:
		next_partition = part2
		from rdkit.Chem import rdmolops
		if len(molecules) == 0:
		return None
		elif len(molecules) == 1:
		return molecules[0]
		else:
		next_partition = part1
		return not next_partition in self.used_partitions, next_partition

		def _writer_line_for_atom(self, atom_number):
		"""

		:param atom_number:
		:return:
		"""
		return self.pdb_map[atom_number]

		def _create_component_map(self, components):
		"""
		Creates a Map From atom_idx to disconnected_component_id
		:param components:
		:return:
		"""
		comp_map = {}
		for i in range(self.mol.GetNumAtoms()):
		for j in range(len(components)):
		if i in components[j]:
		comp_map[i] = j
		break
		self.comp_map = comp_map

		def _create_pdb_map(self):
		"""
		create self.pdb_map. This is a map from rdkit atom number to
		its line in the pdb file. We also add the two additional columns
		required for pdbqt (charge, symbol)

		note rdkit atoms are 0 indexes and pdb files are 1 indexed
		:return:
		"""
		lines = [x.strip() for x in open(self.outfile).readlines()]
		lines = filter(lambda x: x.startswith("HETATM") or x.startswith("ATOM"),
		lines)
		lines = [x[:66] for x in lines]
		pdb_map = {}
		for line in lines:
		my_values = line.split()
		atom_number = int(my_values[1])
		atom_symbol = my_values[2]
		atom_symbol = ''.join([i for i in atom_symbol if not i.isdigit()])
		line = line.replace("HETATM", "ATOM ")
		line = "%s +0.000 %s\n" % (line, atom_symbol.ljust(2))
		pdb_map[atom_number - 1] = line
		self.pdb_map = pdb_map

		def _mol_to_graph(self):
		"""
		Convert self.mol into a graph representation
		atoms are nodes, and bonds are vertices
		store as self.graph
		"""
		import networkx as nx
		G = nx.Graph()
		num_atoms = self.mol.GetNumAtoms()
		G.add_nodes_from(range(num_atoms))
		for i in range(self.mol.GetNumBonds()):
		from_idx = self.mol.GetBonds()[i].GetBeginAtomIdx()
		to_idx = self.mol.GetBonds()[i].GetEndAtomIdx()
		G.add_edge(from_idx, to_idx)
		self.graph = G

		def _get_rotatable_bonds(self):
		"""
		https://github.com/rdkit/rdkit/blob/f4529c910e546af590c56eba01f96e9015c269a6/Code/GraphMol/Descriptors/Lipinski.cpp#L107
		Taken from rdkit source to find which bonds are rotatable
		store rotatable bonds in (from_atom, to_atom)
		"""
		combined = molecules[0]
		for nextmol in molecules[1:]:
		combined = rdmolops.CombineMols(combined, nextmol)
		return combined

		def compute_contact_centroid(molecular_complex, cutoff=4.5):
		    """Computes the (x,y,z) centroid of the contact regions of this molecular complex.

		    Featurizations that build voxel grids need a sensible center for
		    the grid. This function reduces the complex to its contact atoms
		    with `reduce_molecular_complex_to_contacts`, stacks the coordinates
		    of the remaining fragments and averages them.

		    Parameters
		    ----------
		    molecular_complex: Object
		      A representation of a molecular complex, produced by
		      `rdkit_util.load_complex`.
		    cutoff: float, optional
		      The distance in Angstroms considered for computing contacts.

		    Returns
		    -------
		    Numpy array holding the mean over all stacked contact coordinates.
		    """
		    contact_fragments = reduce_molecular_complex_to_contacts(
		        molecular_complex, cutoff)
		    # The first element of every fragment is used as its coordinate array
		    # (presumably the `(xyz, mol)` convention -- confirm against
		    # reduce_molecular_complex_to_contacts).
		    stacked = merge_molecules_xyz(
		        [fragment[0] for fragment in contact_fragments])
		    return np.mean(stacked, axis=0)

		def compute_ring_center(mol, ring_indices):
		    """Computes 3D coordinates of a center of a given ring.

		    Parameters
		    ----------
		    mol: rdkit.rdchem.Mol
		      Molecule containing a ring
		    ring_indices: array-like
		      Indices of atoms forming a ring

		    Returns
		    -------
		    ring_centroid: np.ndarray
		      Position of a ring center, i.e. the mean position of the ring atoms
		    """
		    conformer = mol.GetConformer()
		    ring_xyz = np.zeros((len(ring_indices), 3))
		    for row, atom_idx in enumerate(ring_indices):
		        ring_xyz[row] = np.array(conformer.GetAtomPosition(atom_idx))
		    # The centroid is averaged inline: `compute_centroid` is not among the
		    # imports visible at the top of this module, so calling it here would
		    # fail with a NameError.
		    ring_centroid = np.mean(ring_xyz, axis=0)
		    return ring_centroid

		def compute_ring_normal(mol, ring_indices):
		    """Computes normal to a plane determined by a given ring.

		    Only the first three atoms listed in `ring_indices` are used to span
		    the plane. The returned vector is their cross product and is not
		    scaled to unit length.

		    Parameters
		    ----------
		    mol: rdkit.rdchem.Mol
		      Molecule containing a ring
		    ring_indices: array-like
		      Indices of atoms forming a ring; at least three are required

		    Returns
		    -------
		    normal: np.ndarray
		      Normal vector

		    Raises
		    ------
		    ValueError: if fewer than three atom indices are given, since a plane
		    cannot be determined from them.
		    """
		    if len(ring_indices) < 3:
		        # Without this check the missing points stay at the origin and the
		        # result is a meaningless zero vector.
		        raise ValueError(
		            "At least three atom indices are needed to compute a ring normal")

		    conformer = mol.GetConformer()
		    points = np.zeros((3, 3))
		    for row, atom_idx in enumerate(ring_indices[:3]):
		        points[row] = np.array(conformer.GetAtomPosition(atom_idx))

		    # Two in-plane edge vectors sharing the first atom as their origin.
		    v1 = points[1] - points[0]
		    v2 = points[2] - points[0]
		    normal = np.cross(v1, v2)
		    return normal

		def rotate_molecules(mol_coordinates_list):
		    """Rotates provided molecular coordinates.

		    One rotation matrix `R` is drawn with
		    `generate_random_rotation_matrix()` and the same `R` is applied to
		    every coordinate array in the list, so the molecules keep their
		    positions relative to each other.

		    Parameters
		    ----------
		    mol_coordinates_list: list
		      Elements of list must be (N_atoms, 3) shaped arrays

		    Returns
		    -------
		    List of rotated coordinate arrays, in the same order as the input.
		    The input arrays are left unmodified.
		    """
		    R = generate_random_rotation_matrix()
		    # np.transpose and np.dot never write into their arguments and np.dot
		    # allocates its result, so no defensive copy of the input is needed.
		    return [
		        np.transpose(np.dot(R, np.transpose(mol_coordinates)))
		        for mol_coordinates in mol_coordinates_list
		    ]

		def compute_all_ecfp(mol, indices=None, degree=2):
		"""Obtain molecular fragment for all atoms emanating outward to given degree.

		For each fragment, compute SMILES string (for now) and hash to
		an int. Return a dictionary mapping atom index to hashed
		SMILES.
		"""

		ecfp_dict = {}
		from rdkit import Chem
		from rdkit.Chem import rdmolops
		pattern = Chem.MolFromSmarts(
		"[!$(#)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])("
		"[CH3])[CH3])&!$([CD3](=[N,O,S])-!@[#7,O,S!D1])&!$([#7,O,S!D1]-!@[CD3]="
		"[N,O,S])&!$([CD3](=[N+])-!@[#7!D1])&!$([#7!D1]-!@[CD3]=[N+])]-!@[!$(*#"
		"*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])([CH3])"
		"[CH3])]")
		rdmolops.FastFindRings(self.mol)
		self.rotatable_bonds = self.mol.GetSubstructMatches(pattern)
		for i in range(mol.GetNumAtoms()):
		if indices is not None and i not in indices:
		continue
		env = Chem.FindAtomEnvironmentOfRadiusN(mol, degree, i, useHs=True)
		submol = Chem.PathToSubmol(mol, env)
		smile = Chem.MolToSmiles(submol)
		ecfp_dict[i] = "%s,%s" % (mol.GetAtoms()[i].GetAtomicNum(), smile)

		return ecfp_dict

deepchem/utils/test/test_rdkit_util.py

+63 −0

File changed.

Preview size limit exceeded, changes collapsed.

Admin message