Cleanup pdbqt utils (a15ddcc7) · Commits · 钟慕尧 / deepchem

deepchem/utils/pdbqt_utils.py

+248 −215

Original line number	Diff line number	Diff line
		@@ -30,22 +30,11 @@ def pdbqt_to_pdb(filename=None, pdbqt_data=None):
		return pdb_block


		def convert_mol_to_pdbqt(mol, outfile):
		"""Convert a rdkit molecule into a pdbqt ligand

		Parameters
		----------
		mol: rdkit Mol
		Ligand molecule
		outfile: str
		filename which already has a valid pdb representation of mol
		"""
		PdbqtLigandWriter(mol, outfile).convert()


		def convert_protein_to_pdbqt(mol, outfile):
		"""Convert a protein PDB file into a pdbqt file.

		Writes the extra PDBQT terms directly to `outfile`.

		Parameters
		----------
		mol: rdkit Mol
		@@ -71,19 +60,73 @@ def convert_protein_to_pdbqt(mol, outfile):
		fout.write(line)


		class PdbqtLigandWriter(object):
		def mol_to_graph(mol):
		"""Convert RDKit Mol to NetworkX graph

		Convert mol into a graph representation atoms are nodes, and bonds
		are vertices stored as graph

		Parameters
		----------
		mol: rdkit Mol
		The molecule to convert into a graph.

		Returns
		-------
		graph: networkx.Graph
		Contains atoms indices as nodes, edges as bonds.
		"""
		Create a torsion tree and write to pdbqt file
		import networkx as nx
		G = nx.Graph()
		num_atoms = mol.GetNumAtoms()
		G.add_nodes_from(range(num_atoms))
		for i in range(mol.GetNumBonds()):
		from_idx = mol.GetBonds()[i].GetBeginAtomIdx()
		to_idx = mol.GetBonds()[i].GetEndAtomIdx()
		G.add_edge(from_idx, to_idx)
		return G

		The torsion tree represents rotatable bonds in the molecule.

		Note
		----
		This class requires RDKit to be installed.
		def get_rotatable_bonds(mol):
		"""
		https://github.com/rdkit/rdkit/blob/f4529c910e546af590c56eba01f96e9015c269a6/Code/GraphMol/Descriptors/Lipinski.cpp#L107

		Taken from rdkit source to find which bonds are rotatable store
		rotatable bonds in (from_atom, to_atom)

		Parameters
		----------
		mol: rdkit Mol
		Ligand molecule

		def __init__(self, mol, outfile):
		Returns
		-------
		rotatable_bonds: list
		List of rotatable bonds in molecule
		"""
		from rdkit import Chem
		from rdkit.Chem import rdmolops
		pattern = Chem.MolFromSmarts(
		"[!$(#)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])("
		"[CH3])[CH3])&!$([CD3](=[N,O,S])-!@[#7,O,S!D1])&!$([#7,O,S!D1]-!@[CD3]="
		"[N,O,S])&!$([CD3](=[N+])-!@[#7!D1])&!$([#7!D1]-!@[CD3]=[N+])]-!@[!$(*#"
		"*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])([CH3])"
		"[CH3])]")
		rdmolops.FastFindRings(mol)
		rotatable_bonds = mol.GetSubstructMatches(pattern)
		return rotatable_bonds


		def convert_mol_to_pdbqt(mol, outfile):
		"""Writes the provided ligand molecule to specified file in pdbqt format.

		Creates a torsion tree and write to pdbqt file. The torsion tree
		represents rotatable bonds in the molecule.

		Note
		----
		This function requires RDKit to be installed.

		Parameters
		----------
		mol: rdkit Mol
		@@ -91,192 +134,182 @@ class PdbqtLigandWriter(object):
		outfile: str
		Filename for a valid pdb file with the extention .pdbqt
		"""
		self.mol = mol
		self.outfile = outfile

		def convert(self):
		"""Writes a PDBQT file for `self.mol`.

		The single public function of this class. It converts a molecule
		and a pdb file into a pdbqt file stored in outfile
		"""
		import networkx as nx
		self._create_pdb_map()
		self._mol_to_graph()
		self._get_rotatable_bonds()

		for bond in self.rotatable_bonds:
		self.graph.remove_edge(bond[0], bond[1])
		self.components = [x for x in nx.connected_components(self.graph)]
		self._create_component_map(self.components)

		self.used_partitions = set()
		self.lines = []
		root = max(enumerate(self.components), key=lambda x: len(x[1]))[0]
		self.lines.append("ROOT\n")
		for atom in self.components[root]:
		self.lines.append(self._writer_line_for_atom(atom))
		self.lines.append("ENDROOT\n")
		self.used_partitions.add(root)
		for bond in self.rotatable_bonds:
		valid, next_partition = self._valid_bond(bond, root)
		# Walk through the original file and extract ATOM/HETATM lines and
		# add PDBQT charge annotations.
		pdb_map = _create_pdb_map(outfile)
		graph = mol_to_graph(mol)
		rotatable_bonds = get_rotatable_bonds(mol)

		# Remove rotatable bonds from this molecule
		for bond in rotatable_bonds:
		graph.remove_edge(bond[0], bond[1])
		# Get the connected components now that the rotatable bonds have
		# been removed.
		components = [x for x in nx.connected_components(graph)]
		comp_map = _create_component_map(mol, components)

		used_partitions = set()
		lines = []
		# The root is the largest connected component.
		root = max(enumerate(components), key=lambda x: len(x[1]))[0]
		# Write the root component
		lines.append("ROOT\n")
		for atom in components[root]:
		lines.append(pdb_map[atom])
		lines.append("ENDROOT\n")
		# We've looked at the root, so take note of that
		used_partitions.add(root)
		for bond in rotatable_bonds:
		valid, next_partition = _valid_bond(used_partitions, bond, root, comp_map)
		if not valid:
		continue
		self._dfs(next_partition, bond)
		self.lines.append("TORSDOF %s" % len(self.rotatable_bonds))
		with open(self.outfile, 'w') as fout:
		for line in self.lines:
		_dfs(used_partitions, next_partition, bond, components, rotatable_bonds,
		lines, pdb_map, comp_map)
		lines.append("TORSDOF %s" % len(rotatable_bonds))
		with open(outfile, 'w') as fout:
		for line in lines:
		fout.write(line)

		def _dfs(self, current_partition, bond):

		def _create_pdb_map(outfile):
		"""Create a mapping from atom numbers to lines to write to pdbqt

		This is a map from rdkit atom number to its line in the pdb
		file. We also add the two additional columns required for
		pdbqt (charge, symbol).

		Note rdkit atoms are 0 indexed and pdb files are 1 indexed

		Parameters
		----------
		outfile: str
		filename which already has a valid pdb representation of mol

		Returns
		-------
		pdb_map: dict
		Maps rdkit atom numbers to lines to be written to PDBQT file.
		"""
		lines = [x.strip() for x in open(outfile).readlines()]
		lines = filter(lambda x: x.startswith("HETATM") or x.startswith("ATOM"),
		lines)
		lines = [x[:66] for x in lines]
		pdb_map = {}
		for line in lines:
		my_values = line.split()
		atom_number = int(my_values[1])
		atom_symbol = my_values[2]
		atom_symbol = ''.join([i for i in atom_symbol if not i.isdigit()])
		line = line.replace("HETATM", "ATOM ")
		line = "%s +0.000 %s\n" % (line, atom_symbol.ljust(2))
		pdb_map[atom_number - 1] = line
		return pdb_map


		def _create_component_map(mol, components):
		"""Creates a map from atom ids to disconnected component id

		For each atom in `mol`, maps it to the id of the component in the
		molecule. The intent is that this is used on a molecule whose
		rotatable bonds have been removed. `components` is a list of the
		connected components after this surgery.

		Parameters
		----------
		mol: rdkit Mol
		molecule to find disconnected compontents in
		components: list
		List of connected components

		Returns
		-------
		comp_map: dict
		Maps atom ids to component ides
		"""
		comp_map = {}
		for i in range(mol.GetNumAtoms()):
		for j in range(len(components)):
		if i in components[j]:
		comp_map[i] = j
		break
		return comp_map


		def _dfs(used_partitions, current_partition, bond, components, rotatable_bonds,
		lines, pdb_map, comp_map):
		"""
		This function does a depth first search through the torsion tree

		Parameters
		----------
		used_partions: set
		Partitions which have already been used
		current_partition: object
		The current partition to expand
		bond: object
		the bond which goes from the previous partition into this partition
		components: list
		List of connected components
		rotatable_bonds: list
		List of rotatable bonds
		lines: list
		List of lines to write
		pdb_map: dict
		Maps atom numbers to PDBQT lines to write
		comp_map: dict
		Maps atom numbers to component numbers

		Returns
		-------
		lines: list
		List of lines to write. This has more appended lines.
		"""
		if self._get_component_for_atom(bond[1]) != current_partition:
		if comp_map[bond[1]] != current_partition:
		bond = (bond[1], bond[0])
		self.used_partitions.add(self._get_component_for_atom(bond[0]))
		self.used_partitions.add(self._get_component_for_atom(bond[1]))
		self.lines.append("BRANCH %4s %4s\n" % (bond[0] + 1, bond[1] + 1))
		for atom in self.components[current_partition]:
		self.lines.append(self._writer_line_for_atom(atom))
		for b in self.rotatable_bonds:
		valid, next_partition = self._valid_bond(b, current_partition)
		used_partitions.add(comp_map[bond[0]])
		used_partitions.add(comp_map[bond[1]])
		lines.append("BRANCH %4s %4s\n" % (bond[0] + 1, bond[1] + 1))
		for atom in components[current_partition]:
		lines.append(pdb_map[atom])
		for b in rotatable_bonds:
		valid, next_partition = _valid_bond(used_partitions, b, current_partition,
		comp_map)
		if not valid:
		continue
		self._dfs(next_partition, b)
		self.lines.append("ENDBRANCH %4s %4s\n" % (bond[0] + 1, bond[1] + 1))

		def _get_component_for_atom(self, atom_number):
		"""
		Parameters
		----------
		atom_number: int
		The atom number to check for component_id
		lines = _dfs(used_partitions, next_partition, b, components,
		rotatable_bonds, lines, pdb_map, comp_map)
		lines.append("ENDBRANCH %4s %4s\n" % (bond[0] + 1, bond[1] + 1))
		return lines

		Returns
		-------
		The component_id that atom_number is part of
		"""
		return self.comp_map[atom_number]

		def _valid_bond(self, bond, current_partition):
		"""
		def _valid_bond(used_partitions, bond, current_partition, comp_map):
		"""Helper method to find next partition to explore.

		Used to check if a bond goes from the current partition into a
		partition that is not yet explored

		Parameters
		----------
		used_partions: set
		Partitions which have already been used
		bond: object
		the bond to check if it goes to an unexplored partition
		current_partition: object
		the current partition of the DFS
		comp_map: dict
		Maps atom ids to component ids

		Returns
		-------
		is_valid, next_partition
		"""
		part1 = self.comp_map[bond[0]]
		part2 = self.comp_map[bond[1]]
		part1 = comp_map[bond[0]]
		part2 = comp_map[bond[1]]
		if part1 != current_partition and part2 != current_partition:
		return False, 0
		if part1 == current_partition:
		next_partition = part2
		else:
		next_partition = part1
		return not next_partition in self.used_partitions, next_partition

		def _writer_line_for_atom(self, atom_number):
		"""
		Parameters
		----------
		atom_number: int
		The atom number for this atom

		Returns
		-------
		The self.pdb_map lookup for this atom.
		"""
		return self.pdb_map[atom_number]

		def _create_component_map(self, components):
		"""Creates a Map From atom_idx to disconnected_component_id

		Sets self.comp_map to the computed compnent map.

		Parameters
		----------
		components: list
		List of connected components
		"""
		comp_map = {}
		for i in range(self.mol.GetNumAtoms()):
		for j in range(len(components)):
		if i in components[j]:
		comp_map[i] = j
		break
		self.comp_map = comp_map

		def _create_pdb_map(self):
		"""Create self.pdb_map.

		This is a map from rdkit atom number to its line in the pdb
		file. We also add the two additional columns required for
		pdbqt (charge, symbol)

		note rdkit atoms are 0 indexes and pdb files are 1 indexed
		"""
		lines = [x.strip() for x in open(self.outfile).readlines()]
		lines = filter(lambda x: x.startswith("HETATM") or x.startswith("ATOM"),
		lines)
		lines = [x[:66] for x in lines]
		pdb_map = {}
		for line in lines:
		my_values = line.split()
		atom_number = int(my_values[1])
		atom_symbol = my_values[2]
		atom_symbol = ''.join([i for i in atom_symbol if not i.isdigit()])
		line = line.replace("HETATM", "ATOM ")
		line = "%s +0.000 %s\n" % (line, atom_symbol.ljust(2))
		pdb_map[atom_number - 1] = line
		self.pdb_map = pdb_map

		def _mol_to_graph(self):
		"""
		Convert self.mol into a graph representation
		atoms are nodes, and bonds are vertices
		store as self.graph
		"""
		import networkx as nx
		G = nx.Graph()
		num_atoms = self.mol.GetNumAtoms()
		G.add_nodes_from(range(num_atoms))
		for i in range(self.mol.GetNumBonds()):
		from_idx = self.mol.GetBonds()[i].GetBeginAtomIdx()
		to_idx = self.mol.GetBonds()[i].GetEndAtomIdx()
		G.add_edge(from_idx, to_idx)
		self.graph = G

		def _get_rotatable_bonds(self):
		"""
		https://github.com/rdkit/rdkit/blob/f4529c910e546af590c56eba01f96e9015c269a6/Code/GraphMol/Descriptors/Lipinski.cpp#L107
		Taken from rdkit source to find which bonds are rotatable
		store rotatable bonds in (from_atom, to_atom)
		"""
		from rdkit import Chem
		from rdkit.Chem import rdmolops
		pattern = Chem.MolFromSmarts(
		"[!$(#)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])("
		"[CH3])[CH3])&!$([CD3](=[N,O,S])-!@[#7,O,S!D1])&!$([#7,O,S!D1]-!@[CD3]="
		"[N,O,S])&!$([CD3](=[N+])-!@[#7!D1])&!$([#7!D1]-!@[CD3]=[N+])]-!@[!$(*#"
		"*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])([CH3])"
		"[CH3])]")
		rdmolops.FastFindRings(self.mol)
		self.rotatable_bonds = self.mol.GetSubstructMatches(pattern)
		return not next_partition in used_partitions, next_partition

deepchem/utils/rdkit_util.py

+2 −2

Original line number	Diff line number	Diff line
		@@ -332,9 +332,9 @@ def write_molecule(mol, outfile, is_protein=False):
		writer.write(mol)
		writer.close()
		if is_protein:
		convert_protein_to_pdbqt(mol, outfile)
		pdbqt_utils.convert_protein_to_pdbqt(mol, outfile)
		else:
		convert_mol_to_pdbqt(mol, outfile)
		pdbqt_utils.convert_mol_to_pdbqt(mol, outfile)
		elif ".pdb" in outfile:
		writer = Chem.PDBWriter(outfile)
		writer.write(mol)

deepchem/utils/test/test_pdbqt_utils.py

+46 −13

Original line number	Diff line number	Diff line
		import unittest
		import os
		import tempfile
		from nose.tools import assert_equal
		from deepchem.utils import rdkit_util
		from deepchem.utils import pdbqt_utils


		class TestPDBQTUtils(unittest.TestCase):

		def test_pdbqt_to_pdb(self):
		def setUp(self):
		current_dir = os.path.dirname(os.path.realpath(__file__))
		protein_file = os.path.join(current_dir,
		self.protein_file = os.path.join(current_dir,
		"../../dock/tests/1jld_protein.pdb")
		self.ligand_file = os.path.join(current_dir,
		"../../dock/tests/1jld_ligand.sdf")

		def test_pdbqt_to_pdb(self):
		"""Test that a PDBQT molecule can be converted back in to PDB."""
		xyz, mol = rdkit_util.load_molecule(
		protein_file, calc_charges=False, add_hydrogens=False)
		self.protein_file, calc_charges=False, add_hydrogens=False)
		with tempfile.TemporaryDirectory() as tmp:
		out_pdb = os.path.join(tmp, "mol.pdb")
		out_pdbqt = os.path.join(tmp, "mol.pdbqt")

		rdkit_util.write_molecule(mol, out_pdb)
		rdkit_util.write_molecule(mol, out_pdb, is_protein=True)
		rdkit_util.write_molecule(mol, out_pdbqt, is_protein=True)

		pdb_block = pdbqt_utils.pdbqt_to_pdb(out_pdbqt)
		@@ -33,13 +40,39 @@ class TestPDBQTUtils(unittest.TestCase):
		assert_equal(atom1.GetAtomicNum(), atom2.GetAtomicNum())

		def test_convert_mol_to_pdbqt(self):
		# TODO
		pass
		"""Test that a ligand molecule can be coverted to PDBQT."""
		from rdkit import Chem
		xyz, mol = rdkit_util.load_molecule(
		self.ligand_file, calc_charges=False, add_hydrogens=False)
		with tempfile.TemporaryDirectory() as tmp:
		outfile = os.path.join(tmp, "mol.pdbqt")
		writer = Chem.PDBWriter(outfile)
		writer.write(mol)
		writer.close()
		pdbqt_utils.convert_mol_to_pdbqt(mol, outfile)
		pdbqt_xyz, pdbqt_mol = rdkit_util.load_molecule(
		outfile, add_hydrogens=False, calc_charges=False)
		assert_equal(pdbqt_mol.GetNumAtoms(), pdbqt_mol.GetNumAtoms())
		for atom_idx in range(pdbqt_mol.GetNumAtoms()):
		atom1 = pdbqt_mol.GetAtoms()[atom_idx]
		atom2 = pdbqt_mol.GetAtoms()[atom_idx]
		assert_equal(atom1.GetAtomicNum(), atom2.GetAtomicNum())

		def test_convert_protein_to_pdbqt(self):
		# TODO
		pass

		def test_pdbqt_ligand_writer(self):
		# TODO
		pass
		"""Test a protein in a PDB can be converted to PDBQT."""
		from rdkit import Chem
		xyz, mol = rdkit_util.load_molecule(
		self.protein_file, calc_charges=False, add_hydrogens=False)
		with tempfile.TemporaryDirectory() as tmp:
		outfile = os.path.join(tmp, "mol.pdbqt")
		writer = Chem.PDBWriter(outfile)
		writer.write(mol)
		writer.close()
		pdbqt_utils.convert_protein_to_pdbqt(mol, outfile)
		pdbqt_xyz, pdbqt_mol = rdkit_util.load_molecule(
		outfile, add_hydrogens=False, calc_charges=False)
		assert_equal(pdbqt_mol.GetNumAtoms(), pdbqt_mol.GetNumAtoms())
		for atom_idx in range(pdbqt_mol.GetNumAtoms()):
		atom1 = pdbqt_mol.GetAtoms()[atom_idx]
		atom2 = pdbqt_mol.GetAtoms()[atom_idx]
		assert_equal(atom1.GetAtomicNum(), atom2.GetAtomicNum())

Admin message