Commit a15ddcc7 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Cleanup pdbqt utils

parent 34ac5e6c
Loading
Loading
Loading
Loading
+248 −215
Original line number Diff line number Diff line
@@ -30,22 +30,11 @@ def pdbqt_to_pdb(filename=None, pdbqt_data=None):
  return pdb_block


def convert_mol_to_pdbqt(mol, outfile):
  """Convert a rdkit molecule into a pdbqt ligand

  Parameters
  ----------
  mol: rdkit Mol
    Ligand molecule
  outfile: str
    filename which already has a valid pdb representation of mol
  """
  PdbqtLigandWriter(mol, outfile).convert()


def convert_protein_to_pdbqt(mol, outfile):
  """Convert a protein PDB file into a pdbqt file.

  Writes the extra PDBQT terms directly to `outfile`.

  Parameters
  ----------
  mol: rdkit Mol
@@ -71,19 +60,73 @@ def convert_protein_to_pdbqt(mol, outfile):
      fout.write(line)


class PdbqtLigandWriter(object):
def mol_to_graph(mol):
  """Convert RDKit Mol to NetworkX graph

  Convert mol into a graph representation atoms are nodes, and bonds
  are vertices stored as graph

  Parameters
  ----------
  mol: rdkit Mol
    The molecule to convert into a graph. 

  Returns
  -------
  graph: networkx.Graph
    Contains atoms indices as nodes, edges as bonds.
  """
  Create a torsion tree and write to pdbqt file
  import networkx as nx
  G = nx.Graph()
  num_atoms = mol.GetNumAtoms()
  G.add_nodes_from(range(num_atoms))
  for i in range(mol.GetNumBonds()):
    from_idx = mol.GetBonds()[i].GetBeginAtomIdx()
    to_idx = mol.GetBonds()[i].GetEndAtomIdx()
    G.add_edge(from_idx, to_idx)
  return G

  The torsion tree represents rotatable bonds in the molecule.

  Note
  ----
  This class requires RDKit to be installed.
def get_rotatable_bonds(mol):
  """
  https://github.com/rdkit/rdkit/blob/f4529c910e546af590c56eba01f96e9015c269a6/Code/GraphMol/Descriptors/Lipinski.cpp#L107

  Taken from rdkit source to find which bonds are rotatable store
  rotatable bonds in (from_atom, to_atom)

  Parameters
  ----------
  mol: rdkit Mol
    Ligand molecule

  def __init__(self, mol, outfile):
  Returns
  -------
  rotatable_bonds: list
    List of rotatable bonds in molecule
  """
  from rdkit import Chem
  from rdkit.Chem import rdmolops
  pattern = Chem.MolFromSmarts(
      "[!$(*#*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])("
      "[CH3])[CH3])&!$([CD3](=[N,O,S])-!@[#7,O,S!D1])&!$([#7,O,S!D1]-!@[CD3]="
      "[N,O,S])&!$([CD3](=[N+])-!@[#7!D1])&!$([#7!D1]-!@[CD3]=[N+])]-!@[!$(*#"
      "*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])([CH3])"
      "[CH3])]")
  rdmolops.FastFindRings(mol)
  rotatable_bonds = mol.GetSubstructMatches(pattern)
  return rotatable_bonds


def convert_mol_to_pdbqt(mol, outfile):
  """Writes the provided ligand molecule to specified file in pdbqt format.

  Creates a torsion tree and write to pdbqt file. The torsion tree
  represents rotatable bonds in the molecule. 

  Note
  ----
  This function requires RDKit to be installed.

  Parameters
  ----------
  mol: rdkit Mol
@@ -91,192 +134,182 @@ class PdbqtLigandWriter(object):
  outfile: str
    Filename for a valid pdb file with the extention .pdbqt
  """
    self.mol = mol
    self.outfile = outfile

  def convert(self):
    """Writes a PDBQT file for `self.mol`.

    The single public function of this class.  It converts a molecule
    and a pdb file into a pdbqt file stored in outfile
    """
  import networkx as nx
    self._create_pdb_map()
    self._mol_to_graph()
    self._get_rotatable_bonds()

    for bond in self.rotatable_bonds:
      self.graph.remove_edge(bond[0], bond[1])
    self.components = [x for x in nx.connected_components(self.graph)]
    self._create_component_map(self.components)

    self.used_partitions = set()
    self.lines = []
    root = max(enumerate(self.components), key=lambda x: len(x[1]))[0]
    self.lines.append("ROOT\n")
    for atom in self.components[root]:
      self.lines.append(self._writer_line_for_atom(atom))
    self.lines.append("ENDROOT\n")
    self.used_partitions.add(root)
    for bond in self.rotatable_bonds:
      valid, next_partition = self._valid_bond(bond, root)
  # Walk through the original file and extract ATOM/HETATM lines and
  # add PDBQT charge annotations.
  pdb_map = _create_pdb_map(outfile)
  graph = mol_to_graph(mol)
  rotatable_bonds = get_rotatable_bonds(mol)

  # Remove rotatable bonds from this molecule
  for bond in rotatable_bonds:
    graph.remove_edge(bond[0], bond[1])
  # Get the connected components now that the rotatable bonds have
  # been removed.
  components = [x for x in nx.connected_components(graph)]
  comp_map = _create_component_map(mol, components)

  used_partitions = set()
  lines = []
  # The root is the largest connected component.
  root = max(enumerate(components), key=lambda x: len(x[1]))[0]
  # Write the root component
  lines.append("ROOT\n")
  for atom in components[root]:
    lines.append(pdb_map[atom])
  lines.append("ENDROOT\n")
  # We've looked at the root, so take note of that
  used_partitions.add(root)
  for bond in rotatable_bonds:
    valid, next_partition = _valid_bond(used_partitions, bond, root, comp_map)
    if not valid:
      continue
      self._dfs(next_partition, bond)
    self.lines.append("TORSDOF %s" % len(self.rotatable_bonds))
    with open(self.outfile, 'w') as fout:
      for line in self.lines:
    _dfs(used_partitions, next_partition, bond, components, rotatable_bonds,
         lines, pdb_map, comp_map)
  lines.append("TORSDOF %s" % len(rotatable_bonds))
  with open(outfile, 'w') as fout:
    for line in lines:
      fout.write(line)

  def _dfs(self, current_partition, bond):

def _create_pdb_map(outfile):
  """Create a mapping from atom numbers to lines to write to pdbqt

  This is a map from rdkit atom number to its line in the pdb
  file. We also add the two additional columns required for
  pdbqt (charge, symbol).

  Note rdkit atoms are 0 indexed and pdb files are 1 indexed

  Parameters
  ----------
  outfile: str
    filename which already has a valid pdb representation of mol

  Returns
  -------
  pdb_map: dict
    Maps rdkit atom numbers to lines to be written to PDBQT file.
  """
  lines = [x.strip() for x in open(outfile).readlines()]
  lines = filter(lambda x: x.startswith("HETATM") or x.startswith("ATOM"),
                 lines)
  lines = [x[:66] for x in lines]
  pdb_map = {}
  for line in lines:
    my_values = line.split()
    atom_number = int(my_values[1])
    atom_symbol = my_values[2]
    atom_symbol = ''.join([i for i in atom_symbol if not i.isdigit()])
    line = line.replace("HETATM", "ATOM  ")
    line = "%s    +0.000 %s\n" % (line, atom_symbol.ljust(2))
    pdb_map[atom_number - 1] = line
  return pdb_map


def _create_component_map(mol, components):
  """Creates a map from atom ids to disconnected component id

  For each atom in `mol`, maps it to the id of the component in the
  molecule. The intent is that this is used on a molecule whose
  rotatable bonds have been removed. `components` is a list of the
  connected components after this surgery.

  Parameters
  ----------
  mol: rdkit Mol
    molecule to find disconnected compontents in 
  components: list
    List of connected components

  Returns
  -------
  comp_map: dict
    Maps atom ids to component ides
  """
  comp_map = {}
  for i in range(mol.GetNumAtoms()):
    for j in range(len(components)):
      if i in components[j]:
        comp_map[i] = j
        break
  return comp_map


def _dfs(used_partitions, current_partition, bond, components, rotatable_bonds,
         lines, pdb_map, comp_map):
  """
  This function does a depth first search through the torsion tree

  Parameters
  ----------
  used_partions: set
    Partitions which have already been used
  current_partition: object
    The current partition to expand
  bond: object
    the bond which goes from the previous partition into this partition
  components: list
    List of connected components
  rotatable_bonds: list
    List of rotatable bonds
  lines: list
    List of lines to write
  pdb_map: dict
    Maps atom numbers to PDBQT lines to write
  comp_map: dict
    Maps atom numbers to component numbers

  Returns
  -------
  lines: list
    List of lines to write. This has more appended lines.
  """
    if self._get_component_for_atom(bond[1]) != current_partition:
  if comp_map[bond[1]] != current_partition:
    bond = (bond[1], bond[0])
    self.used_partitions.add(self._get_component_for_atom(bond[0]))
    self.used_partitions.add(self._get_component_for_atom(bond[1]))
    self.lines.append("BRANCH %4s %4s\n" % (bond[0] + 1, bond[1] + 1))
    for atom in self.components[current_partition]:
      self.lines.append(self._writer_line_for_atom(atom))
    for b in self.rotatable_bonds:
      valid, next_partition = self._valid_bond(b, current_partition)
  used_partitions.add(comp_map[bond[0]])
  used_partitions.add(comp_map[bond[1]])
  lines.append("BRANCH %4s %4s\n" % (bond[0] + 1, bond[1] + 1))
  for atom in components[current_partition]:
    lines.append(pdb_map[atom])
  for b in rotatable_bonds:
    valid, next_partition = _valid_bond(used_partitions, b, current_partition,
                                        comp_map)
    if not valid:
      continue
      self._dfs(next_partition, b)
    self.lines.append("ENDBRANCH %4s %4s\n" % (bond[0] + 1, bond[1] + 1))

  def _get_component_for_atom(self, atom_number):
    """
    Parameters
    ----------
    atom_number: int
      The atom number to check for component_id
    lines = _dfs(used_partitions, next_partition, b, components,
                 rotatable_bonds, lines, pdb_map, comp_map)
  lines.append("ENDBRANCH %4s %4s\n" % (bond[0] + 1, bond[1] + 1))
  return lines

    Returns
    -------
    The component_id that atom_number is part of
    """
    return self.comp_map[atom_number]

  def _valid_bond(self, bond, current_partition):
    """
def _valid_bond(used_partitions, bond, current_partition, comp_map):
  """Helper method to find next partition to explore.

  Used to check if a bond goes from the current partition into a
  partition that is not yet explored

  Parameters
  ----------
  used_partions: set
    Partitions which have already been used
  bond: object
    the bond to check if it goes to an unexplored partition
  current_partition: object
    the current partition of the DFS
  comp_map: dict
    Maps atom ids to component ids

  Returns
  -------
  is_valid, next_partition
  """
    part1 = self.comp_map[bond[0]]
    part2 = self.comp_map[bond[1]]
  part1 = comp_map[bond[0]]
  part2 = comp_map[bond[1]]
  if part1 != current_partition and part2 != current_partition:
    return False, 0
  if part1 == current_partition:
    next_partition = part2
  else:
    next_partition = part1
    return not next_partition in self.used_partitions, next_partition

  def _writer_line_for_atom(self, atom_number):
    """
    Parameters
    ----------
    atom_number: int
      The atom number for this atom

    Returns
    -------
    The self.pdb_map lookup for this atom.
    """
    return self.pdb_map[atom_number]

  def _create_component_map(self, components):
    """Creates a Map From atom_idx to disconnected_component_id

    Sets self.comp_map to the computed compnent map.

    Parameters
    ----------
    components: list
      List of connected components
    """
    comp_map = {}
    for i in range(self.mol.GetNumAtoms()):
      for j in range(len(components)):
        if i in components[j]:
          comp_map[i] = j
          break
    self.comp_map = comp_map

  def _create_pdb_map(self):
    """Create self.pdb_map.

    This is a map from rdkit atom number to its line in the pdb
    file. We also add the two additional columns required for
    pdbqt (charge, symbol)

    note rdkit atoms are 0 indexes and pdb files are 1 indexed
    """
    lines = [x.strip() for x in open(self.outfile).readlines()]
    lines = filter(lambda x: x.startswith("HETATM") or x.startswith("ATOM"),
                   lines)
    lines = [x[:66] for x in lines]
    pdb_map = {}
    for line in lines:
      my_values = line.split()
      atom_number = int(my_values[1])
      atom_symbol = my_values[2]
      atom_symbol = ''.join([i for i in atom_symbol if not i.isdigit()])
      line = line.replace("HETATM", "ATOM  ")
      line = "%s    +0.000 %s\n" % (line, atom_symbol.ljust(2))
      pdb_map[atom_number - 1] = line
    self.pdb_map = pdb_map

  def _mol_to_graph(self):
    """
    Convert self.mol into a graph representation
    atoms are nodes, and bonds are vertices
    store as self.graph
    """
    import networkx as nx
    G = nx.Graph()
    num_atoms = self.mol.GetNumAtoms()
    G.add_nodes_from(range(num_atoms))
    for i in range(self.mol.GetNumBonds()):
      from_idx = self.mol.GetBonds()[i].GetBeginAtomIdx()
      to_idx = self.mol.GetBonds()[i].GetEndAtomIdx()
      G.add_edge(from_idx, to_idx)
    self.graph = G

  def _get_rotatable_bonds(self):
    """
    https://github.com/rdkit/rdkit/blob/f4529c910e546af590c56eba01f96e9015c269a6/Code/GraphMol/Descriptors/Lipinski.cpp#L107
    Taken from rdkit source to find which bonds are rotatable
    store rotatable bonds in (from_atom, to_atom)
    """
    from rdkit import Chem
    from rdkit.Chem import rdmolops
    pattern = Chem.MolFromSmarts(
        "[!$(*#*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])("
        "[CH3])[CH3])&!$([CD3](=[N,O,S])-!@[#7,O,S!D1])&!$([#7,O,S!D1]-!@[CD3]="
        "[N,O,S])&!$([CD3](=[N+])-!@[#7!D1])&!$([#7!D1]-!@[CD3]=[N+])]-!@[!$(*#"
        "*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])([CH3])"
        "[CH3])]")
    rdmolops.FastFindRings(self.mol)
    self.rotatable_bonds = self.mol.GetSubstructMatches(pattern)
  return not next_partition in used_partitions, next_partition
+2 −2
Original line number Diff line number Diff line
@@ -332,9 +332,9 @@ def write_molecule(mol, outfile, is_protein=False):
    writer.write(mol)
    writer.close()
    if is_protein:
      convert_protein_to_pdbqt(mol, outfile)
      pdbqt_utils.convert_protein_to_pdbqt(mol, outfile)
    else:
      convert_mol_to_pdbqt(mol, outfile)
      pdbqt_utils.convert_mol_to_pdbqt(mol, outfile)
  elif ".pdb" in outfile:
    writer = Chem.PDBWriter(outfile)
    writer.write(mol)
+46 −13
Original line number Diff line number Diff line
import unittest
import os
import tempfile
from nose.tools import assert_equal
from deepchem.utils import rdkit_util
from deepchem.utils import pdbqt_utils


class TestPDBQTUtils(unittest.TestCase):

  def test_pdbqt_to_pdb(self):
  def setUp(self):
    current_dir = os.path.dirname(os.path.realpath(__file__))
    protein_file = os.path.join(current_dir,
    self.protein_file = os.path.join(current_dir,
                                     "../../dock/tests/1jld_protein.pdb")
    self.ligand_file = os.path.join(current_dir,
                                    "../../dock/tests/1jld_ligand.sdf")

  def test_pdbqt_to_pdb(self):
    """Test that a PDBQT molecule can be converted back in to PDB."""
    xyz, mol = rdkit_util.load_molecule(
        protein_file, calc_charges=False, add_hydrogens=False)
        self.protein_file, calc_charges=False, add_hydrogens=False)
    with tempfile.TemporaryDirectory() as tmp:
      out_pdb = os.path.join(tmp, "mol.pdb")
      out_pdbqt = os.path.join(tmp, "mol.pdbqt")

      rdkit_util.write_molecule(mol, out_pdb)
      rdkit_util.write_molecule(mol, out_pdb, is_protein=True)
      rdkit_util.write_molecule(mol, out_pdbqt, is_protein=True)

      pdb_block = pdbqt_utils.pdbqt_to_pdb(out_pdbqt)
@@ -33,13 +40,39 @@ class TestPDBQTUtils(unittest.TestCase):
      assert_equal(atom1.GetAtomicNum(), atom2.GetAtomicNum())

  def test_convert_mol_to_pdbqt(self):
    # TODO
    pass
    """Test that a ligand molecule can be coverted to PDBQT."""
    from rdkit import Chem
    xyz, mol = rdkit_util.load_molecule(
        self.ligand_file, calc_charges=False, add_hydrogens=False)
    with tempfile.TemporaryDirectory() as tmp:
      outfile = os.path.join(tmp, "mol.pdbqt")
      writer = Chem.PDBWriter(outfile)
      writer.write(mol)
      writer.close()
      pdbqt_utils.convert_mol_to_pdbqt(mol, outfile)
      pdbqt_xyz, pdbqt_mol = rdkit_util.load_molecule(
          outfile, add_hydrogens=False, calc_charges=False)
    assert_equal(pdbqt_mol.GetNumAtoms(), pdbqt_mol.GetNumAtoms())
    for atom_idx in range(pdbqt_mol.GetNumAtoms()):
      atom1 = pdbqt_mol.GetAtoms()[atom_idx]
      atom2 = pdbqt_mol.GetAtoms()[atom_idx]
      assert_equal(atom1.GetAtomicNum(), atom2.GetAtomicNum())

  def test_convert_protein_to_pdbqt(self):
    # TODO
    pass

  def test_pdbqt_ligand_writer(self):
    # TODO
    pass
    """Test a protein in a PDB can be converted to PDBQT."""
    from rdkit import Chem
    xyz, mol = rdkit_util.load_molecule(
        self.protein_file, calc_charges=False, add_hydrogens=False)
    with tempfile.TemporaryDirectory() as tmp:
      outfile = os.path.join(tmp, "mol.pdbqt")
      writer = Chem.PDBWriter(outfile)
      writer.write(mol)
      writer.close()
      pdbqt_utils.convert_protein_to_pdbqt(mol, outfile)
      pdbqt_xyz, pdbqt_mol = rdkit_util.load_molecule(
          outfile, add_hydrogens=False, calc_charges=False)
    assert_equal(pdbqt_mol.GetNumAtoms(), pdbqt_mol.GetNumAtoms())
    for atom_idx in range(pdbqt_mol.GetNumAtoms()):
      atom1 = pdbqt_mol.GetAtoms()[atom_idx]
      atom2 = pdbqt_mol.GetAtoms()[atom_idx]
      assert_equal(atom1.GetAtomicNum(), atom2.GetAtomicNum())