Commit d54147be authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

changes

parent 810ae59b
Loading
Loading
Loading
Loading
+357 −255
Original line number Diff line number Diff line
import logging
"""
RDKit Utilities.

import numpy as np
import os
This file contains utilities that compute useful properties of
molecules. Some of these are simple cleanup utilities, and
others are more sophisticated functions that detect chemical
properties of molecules.
"""

try:
  from StringIO import StringIO
except ImportError:
import os
import logging
import itertools
import numpy as np
from io import StringIO
from copy import deepcopy
from collections import Counter
from deepchem.utils import pdbqt_utils
from deepchem.utils.pdbqt_utils import convert_mol_to_pdbqt
from deepchem.utils.pdbqt_utils import convert_protein_to_pdbqt
from deepchem.utils.geometry_utils import angle_between
from deepchem.utils.geometry_utils import is_angle_within_cutoff
from deepchem.utils.geometry_utils import generate_random_rotation_matrix

logger = logging.getLogger(__name__)

class MoleculeLoadException(Exception):

@@ -16,9 +30,18 @@ class MoleculeLoadException(Exception):


def get_xyz_from_mol(mol):
  """
  returns an m x 3 np array of 3d coords
  of given rdkit molecule
  """Extracts a numpy array of coordinates from a molecules.

  Returns a `(N, 3)` numpy array of 3d coords of given rdkit molecule

  Parameters
  ----------
  mol: rdkit Molecule
    Molecule to extract coordinates for

  Returns
  -------
  Numpy ndarray of shape `(N, 3)` where `N = mol.GetNumAtoms()`.
  """
  xyz = np.zeros((mol.GetNumAtoms(), 3))
  conf = mol.GetConformer()
@@ -29,13 +52,55 @@ def get_xyz_from_mol(mol):
    xyz[i, 2] = position.z
  return (xyz)


def add_hydrogens_to_mol(mol):
  """
  Add hydrogens to a molecule object
  TODO (LESWING) see if there are more flags to add here for default
  :param mol: Rdkit Mol
  :return: Rdkit Mol

  Parameters
  ----------
  mol: Rdkit Mol
    Molecule to hydrogenate

  Returns
  -------
  Rdkit Mol

  Note
  ----
  This function requires RDKit and PDBFixer to be installed.
  """
  return apply_pdbfixer(mol, hydrogenate=True)


def apply_pdbfixer(mol, add_missing=True, hydrogenate=True, pH=7.4,
                   remove_heterogens=True, is_protein=True):
  """
  Apply PDBFixer to a molecule to try to clean it up.

  Parameters
  ----------
  mol: Rdkit Mol
    Molecule to hydrogenate
  add_missing: bool, optional
    If true, add in missing residues and atoms
  hydrogenate: bool, optional
    If true, add hydrogens at specified pH
  pH: float, optional
    The pH at which hydrogens will be added if `hydrogenate==True`. Set to 7.4 by default.
  remove_heterogens: bool, optional
    Often times, PDB files come with extra waters and salts attached.
    If this field is set, remove these heterogens.
  is_protein: bool, optional
    If false, then don't remove heterogens (since this molecule is
    itself a heterogen).
  
  Returns
  -------
  Rdkit Mol

  Note
  ----
  This function requires RDKit and PDBFixer to be installed.
  """
  molecule_file = None
  try:
@@ -46,10 +111,15 @@ def add_hydrogens_to_mol(mol):
    pdb_stringio.seek(0)
    import pdbfixer
    fixer = pdbfixer.PDBFixer(pdbfile=pdb_stringio)
    if add_missing:
      fixer.findMissingResidues()
      fixer.findMissingAtoms()
      fixer.addMissingAtoms()
    fixer.addMissingHydrogens(7.4)
    if hydrogenate:
      fixer.addMissingHydrogens(pH)
    if is_protein and remove_heterogens:
      # False here specifies that water is to be removed
      fixer.removeHeterogens(False)

    hydrogenated_io = StringIO()
    import simtk
@@ -59,7 +129,7 @@ def add_hydrogens_to_mol(mol):
    return Chem.MolFromPDBBlock(
        hydrogenated_io.read(), sanitize=False, removeHs=False)
  except ValueError as e:
    logging.warning("Unable to add hydrogens %s", e)
    logger.warning("Unable to add hydrogens %s", e)
    raise MoleculeLoadException(e)
  finally:
    try:
@@ -69,10 +139,10 @@ def add_hydrogens_to_mol(mol):


def compute_charges(mol):
  """
  Attempt to compute Gasteiger Charges on Mol
  This also has the side effect of calculating charges on mol.
  The mol passed into this function has to already have been sanitized
  """Attempt to compute Gasteiger Charges on Mol

  This also has the side effect of calculating charges on mol.  The
  mol passed into this function has to already have been sanitized

  Params
  ------
@@ -80,44 +150,125 @@ def compute_charges(mol):

  Returns
  -------
  molecule with charges
  No return since updates in place.
  
  Note
  ----
  This function requires RDKit to be installed.
  """
  from rdkit.Chem import AllChem
  try:
    # Updates charges in place
    AllChem.ComputeGasteigerCharges(mol)
  except Exception as e:
    logging.exception("Unable to compute charges for mol")
    raise MoleculeLoadException(e)
  return mol

def load_complex(molecular_complex,
                 add_hydrogens=True,
                 calc_charges=True,
                 pdbfix=True,
                 sanitize=True):
  """Loads a molecular complex.

  Given some representation of a molecular complex, returns a list of
  tuples, where each tuple contains (xyz coords, rdkit object) for
  that constituent molecule in the complex.

  For now, assumes that molecular_complex is a tuple of filenames.

  Parameters
  ----------
  molecular_complex: list or str
    If list, each entry should be a filename for a constituent
    molecule in complex. If str, should be the filename of a file that
    holds the full complex.
  add_hydrogens: bool, optional
    If true, add hydrogens via pdbfixer
  calc_charges: bool, optional
    If true, add charges via rdkit
  pdbfix: bool, optional
    If true, apply pdbfixer to clean up this molecule.
  sanitize: bool, optional
    If true, sanitize molecules via rdkit

  Returns
  -------
  List of tuples (xyz, mol)

  Note
  ----
  This function requires RDKit to be installed.
  """
  if isinstance(molecular_complex, str):
    molecule_complex = [molecular_complex]
  fragments = []
  for mol in molecular_complex:
    loaded = load_molecule(mol,
                           add_hydrogens=add_hydrogens,
                           calc_charges=calc_charges,
                           pdbfix=pdbfix,
                           sanitize=sanitize)
    if isinstance(loaded, list):
      fragments += loaded
    else:
      fragments.append(loaded)
  return fragments
    


def load_molecule(molecule_file,
                  add_hydrogens=True,
                  calc_charges=True,
                  sanitize=False):
  """
  Converts molecule file to (xyz-coords, obmol object)
                  sanitize=True,
                  pdbfix=True):
  """Converts molecule file to (xyz-coords, obmol object)

  Given molecule_file, returns a tuple of xyz coords of molecule
  and an rdkit object representing that molecule
  :param molecule_file: filename for molecule
  :param add_hydrogens: should add hydrogens via pdbfixer?
  :param calc_charges: should add charges vis rdkit
  :return: (xyz, mol)
  and an rdkit object representing that molecule in that order `(xyz,
  rdkit_mol)`. This ordering convention is used in the code in a few
  places.

  Parameters
  ----------
  molecule_file: str
    filename for molecule
  add_hydrogens: bool, optional
    If true, add hydrogens via pdbfixer
  calc_charges: bool, optional
    If true, add charges via rdkit
  sanitize: bool, optional
    If true, sanitize molecules via rdkit
  pdbfix: bool, optional
    If true, apply pdbfixer to clean up this molecule.

  Returns
  -------
  Tuple (xyz, mol) if file contains single molecule. Else returns a
  list of the tuples for the separate molecules in this list.

  Note
  ----
  This function requires RDKit to be installed.
  """
  from rdkit import Chem
  from rdkit.Chem.rdchem import AtomValenceException
  from_pdb = False
  if ".mol2" in molecule_file:
    my_mol = Chem.MolFromMol2File(molecule_file, sanitize=False, removeHs=False)
  elif ".sdf" in molecule_file:
    suppl = Chem.SDMolSupplier(str(molecule_file), sanitize=False)
    # TODO: This is wrong. Should return all molecules
    my_mol = suppl[0]
  elif ".pdbqt" in molecule_file:
    pdb_block = pdbqt_to_pdb(molecule_file)
    pdb_block = pdbqt_utils.pdbqt_to_pdb(molecule_file)
    my_mol = Chem.MolFromPDBBlock(
        str(pdb_block), sanitize=False, removeHs=False)
    from_pdb = True
  elif ".pdb" in molecule_file:
    my_mol = Chem.MolFromPDBFile(
        str(molecule_file), sanitize=False, removeHs=False)
    from_pdb = True
  else:
    raise ValueError("Unrecognized file type")

@@ -125,12 +276,15 @@ def load_molecule(molecule_file,
    raise ValueError("Unable to read non None Molecule Object")

  if add_hydrogens or calc_charges:
    my_mol = add_hydrogens_to_mol(my_mol)
  # TODO: mol should be always sanitized when charges are calculated
  # can't change it now because it would break a lot of examples
    # We assume if it's from a PDB, it should be a protein
    my_mol = apply_pdbfixer(my_mol, hydrogenate=add_hydrogens, is_protein=from_pdb)
  if sanitize:
    try:
      Chem.SanitizeMol(my_mol)
    except AtomValenceException:
      logger.warn("Mol %s failed valence check" % Chem.MolToSmiles(my_mol))
  if calc_charges:
    # This updates in place
    compute_charges(my_mol)

  xyz = get_xyz_from_mol(my_mol)
@@ -138,45 +292,28 @@ def load_molecule(molecule_file,
  return xyz, my_mol


def pdbqt_file_hack_protein(mol, outfile):
  """
  Hack to convert a pdb protein into a pdbqt protein
  :param mol: rdkit Mol of protein
  :param outfile: filename which already has a valid pdb representation of mol
  """
  lines = [x.strip() for x in open(outfile).readlines()]
  out_lines = []
  for line in lines:
    if "ROOT" in line or "ENDROOT" in line or "TORSDOF" in line:
      out_lines.append("%s\n" % line)
      continue
    if not line.startswith("ATOM"):
      continue
    line = line[:66]
    atom_index = int(line.split()[1])
    atom = mol.GetAtoms()[atom_index - 1]
    line = "%s    +0.000 %s\n" % (line, atom.GetSymbol().ljust(2))
    out_lines.append(line)
  with open(outfile, 'w') as fout:
    for line in out_lines:
      fout.write(line)
def write_molecule(mol, outfile, is_protein=False):
  """Write molecule to a file

  This function writes a representation of the provided molecule to
  the specified `outfile`. Doesn't return anything.

def pdbqt_file_hack_ligand(mol, outfile):
  """
  Hack to convert a pdb ligand into a pdbqt ligand
  :param mol: rdkit Mol Object
  :param outfile: filename which already has a valid pdb representation of mol
  """
  PdbqtLigandWriter(mol, outfile).convert()
  Parameters
  ----------
  mol: rdkit Mol
    Molecule to write
  outfile: str
    Filename to write mol to
  is_protein: bool, optional
    Is this molecule a protein?

  Note
  ----
  This function requires RDKit to be installed.

def write_molecule(mol, outfile, is_protein=False):
  """
   Write molecule to a file
  :param mol: rdkit Mol object
  :param outfile: filename to write mol to
  :param is_protein: is this molecule a protein?
  Raises
  ------
  ValueError: if `outfile` isn't of a supported format.
  """
  from rdkit import Chem
  if ".pdbqt" in outfile:
@@ -184,9 +321,9 @@ def write_molecule(mol, outfile, is_protein=False):
    writer.write(mol)
    writer.close()
    if is_protein:
      pdbqt_file_hack_protein(mol, outfile)
      convert_protein_to_pdbqt(mol, outfile)
    else:
      pdbqt_file_hack_ligand(mol, outfile)
      convert_mol_to_pdbqt(mol, outfile)
  elif ".pdb" in outfile:
    writer = Chem.PDBWriter(outfile)
    writer.write(mol)
@@ -199,192 +336,157 @@ def write_molecule(mol, outfile, is_protein=False):
    raise ValueError("Unsupported Format")


def pdbqt_to_pdb(filename):
  pdbqt_data = open(filename).readlines()
  pdb_block = ""
  for line in pdbqt_data:
    pdb_block += "%s\n" % line[:66]
  return pdb_block


def merge_molecules_xyz(protein_xyz, ligand_xyz):
  """Merges coordinates of protein and ligand.
  """
  return np.array(np.vstack(np.vstack((protein_xyz, ligand_xyz))))


def merge_molecules(ligand, protein):
  """Helper method to merge ligand and protein molecules."""
  from rdkit.Chem import rdmolops
  return rdmolops.CombineMols(ligand, protein)


class PdbqtLigandWriter(object):
  """
  Create a torsion tree and write to pdbqt file
  """
def merge_molecules_xyz(xyzs):
  """Merges coordinates of multiple molecules. 

  def __init__(self, mol, outfile):
  Parameters
  ----------
  xyzs: List
    List of numpy arrays each of shape `(N_i, 3)` where `N_i` is the number of atoms in the i-th atom.
  """
    :param mol: The molecule whose value is stored in pdb format in outfile
    :param outfile: a valid pdb file with the extention .pdbqt
    """
    self.mol = mol
    self.outfile = outfile
  return np.array(np.vstack(np.vstack(xyzs)))

  def convert(self):
    """
    The single public function of this class.
    It converts a molecule and a pdb file into a pdbqt file stored in outfile
    """
    import networkx as nx
    self._create_pdb_map()
    self._mol_to_graph()
    self._get_rotatable_bonds()

    for bond in self.rotatable_bonds:
      self.graph.remove_edge(bond[0], bond[1])
    self.components = [x for x in nx.connected_components(self.graph)]
    self._create_component_map(self.components)

    self.used_partitions = set()
    self.lines = []
    root = max(enumerate(self.components), key=lambda x: len(x[1]))[0]
    self.lines.append("ROOT\n")
    for atom in self.components[root]:
      self.lines.append(self._writer_line_for_atom(atom))
    self.lines.append("ENDROOT\n")
    self.used_partitions.add(root)
    for bond in self.rotatable_bonds:
      valid, next_partition = self._valid_bond(bond, root)
      if not valid:
        continue
      self._dfs(next_partition, bond)
    self.lines.append("TORSDOF %s" % len(self.rotatable_bonds))
    with open(self.outfile, 'w') as fout:
      for line in self.lines:
        fout.write(line)

  def _dfs(self, current_partition, bond):
    """
    This function does a depth first search throught he torsion tree
    :param current_partition: The current partition to expand
    :param bond: the bond which goes from the previous partition into this partition
    """
    if self._get_component_for_atom(bond[1]) != current_partition:
      bond = (bond[1], bond[0])
    self.used_partitions.add(self._get_component_for_atom(bond[0]))
    self.used_partitions.add(self._get_component_for_atom(bond[1]))
    self.lines.append("BRANCH %4s %4s\n" % (bond[0] + 1, bond[1] + 1))
    for atom in self.components[current_partition]:
      self.lines.append(self._writer_line_for_atom(atom))
    for b in self.rotatable_bonds:
      valid, next_partition = self._valid_bond(b, current_partition)
      if not valid:
        continue
      self._dfs(next_partition, b)
    self.lines.append("ENDBRANCH %4s %4s\n" % (bond[0] + 1, bond[1] + 1))
def merge_molecules(molecules):
  """Helper method to merge two molecules.

  def _get_component_for_atom(self, atom_number):
    """
    :param atom_number: the atom number to check for component_id
    :return: the component_id that atom_number is part of
    """
    return self.comp_map[atom_number]
  Parameters
  ----------
  molecules: list
    List of rdkit molecules

  def _valid_bond(self, bond, current_partition):
    """
    used to check if a bond goes from the current partition into a partition
    that is not yet explored
    :param bond: the bond to check if it goes to an unexplored partition
    :param current_partition: the current partition of the DFS
    :return: is_valid, next_partition
  Returns
  -------
  merged: rdkit molecule
  """
    part1 = self.comp_map[bond[0]]
    part2 = self.comp_map[bond[1]]
    if part1 != current_partition and part2 != current_partition:
      return False, 0
    if part1 == current_partition:
      next_partition = part2
  from rdkit.Chem import rdmolops
  if len(molecules) == 0:
    return None
  elif len(molecules) == 1:
    return molecules[0]
  else:
      next_partition = part1
    return not next_partition in self.used_partitions, next_partition

  def _writer_line_for_atom(self, atom_number):
    """

    :param atom_number:
    :return:
    """
    return self.pdb_map[atom_number]

  def _create_component_map(self, components):
    """
    Creates a Map From atom_idx to disconnected_component_id
    :param components:
    :return:
    """
    comp_map = {}
    for i in range(self.mol.GetNumAtoms()):
      for j in range(len(components)):
        if i in components[j]:
          comp_map[i] = j
          break
    self.comp_map = comp_map

  def _create_pdb_map(self):
    """
    create self.pdb_map.  This is a map from rdkit atom number to
    its line in the pdb file.  We also add the two additional columns
    required for pdbqt (charge, symbol)

    note rdkit atoms are 0 indexes and pdb files are 1 indexed
    :return:
    """
    lines = [x.strip() for x in open(self.outfile).readlines()]
    lines = filter(lambda x: x.startswith("HETATM") or x.startswith("ATOM"),
                   lines)
    lines = [x[:66] for x in lines]
    pdb_map = {}
    for line in lines:
      my_values = line.split()
      atom_number = int(my_values[1])
      atom_symbol = my_values[2]
      atom_symbol = ''.join([i for i in atom_symbol if not i.isdigit()])
      line = line.replace("HETATM", "ATOM  ")
      line = "%s    +0.000 %s\n" % (line, atom_symbol.ljust(2))
      pdb_map[atom_number - 1] = line
    self.pdb_map = pdb_map

  def _mol_to_graph(self):
    """
    Convert self.mol into a graph representation
    atoms are nodes, and bonds are vertices
    store as self.graph
    """
    import networkx as nx
    G = nx.Graph()
    num_atoms = self.mol.GetNumAtoms()
    G.add_nodes_from(range(num_atoms))
    for i in range(self.mol.GetNumBonds()):
      from_idx = self.mol.GetBonds()[i].GetBeginAtomIdx()
      to_idx = self.mol.GetBonds()[i].GetEndAtomIdx()
      G.add_edge(from_idx, to_idx)
    self.graph = G

  def _get_rotatable_bonds(self):
    """
    https://github.com/rdkit/rdkit/blob/f4529c910e546af590c56eba01f96e9015c269a6/Code/GraphMol/Descriptors/Lipinski.cpp#L107
    Taken from rdkit source to find which bonds are rotatable
    store rotatable bonds in (from_atom, to_atom)
    """
    combined = molecules[0]
    for nextmol in molecules[1:]:
      combined = rdmolops.CombineMols(combined, nextmol)
    return combined

def compute_contact_centroid(molecular_complex, cutoff=4.5):
  """Computes the (x,y,z) centroid of the contact regions of this molecular complex.

  For a molecular complex, it's necessary for various featurizations
  that compute voxel grids to find a reasonable center for the
  voxelization. This function computes the centroid of all the contact
  atoms, defined as an atom that's within `cutoff` Angstroms of an
  atom from a different molecule.

  Parameters
  ----------
  molecular_complex: Object
    A representation of a molecular complex, produced by
    `rdkit_util.load_complex`.
  cutoff: float, optional
    The distance in Angstroms considered for computing contacts.
  """
  fragments = reduce_molecular_complex_to_contacts(molecular_complex, cutoff)
  coords = [frag[0] for frag in fragments]
  contact_coords = merge_molecules_xyz(coords) 
  centroid = np.mean(contact_coords, axis=0)
  return (centroid)

def compute_ring_center(mol, ring_indices):
  """Computes 3D coordinates of a center of a given ring.

  Parameters:
  -----------
  mol: rdkit.rdchem.Mol
    Molecule containing a ring
  ring_indices: array-like
    Indices of atoms forming a ring

  Returns:
  --------
    ring_centroid: np.ndarray
      Position of a ring center
  """
  conformer = mol.GetConformer()
  ring_xyz = np.zeros((len(ring_indices), 3))
  for i, atom_idx in enumerate(ring_indices):
    atom_position = conformer.GetAtomPosition(atom_idx)
    ring_xyz[i] = np.array(atom_position)
  ring_centroid = compute_centroid(ring_xyz)
  return ring_centroid
 
def compute_ring_normal(mol, ring_indices):
  """Computes normal to a plane determined by a given ring.

  Parameters:
  -----------
  mol: rdkit.rdchem.Mol
    Molecule containing a ring
  ring_indices: array-like
    Indices of atoms forming a ring

  Returns:
  --------
  normal: np.ndarray
    Normal vector
  """
  conformer = mol.GetConformer()
  points = np.zeros((3, 3))
  for i, atom_idx in enumerate(ring_indices[:3]):
    atom_position = conformer.GetAtomPosition(atom_idx)
    points[i] = np.array(atom_position)

  v1 = points[1] - points[0]
  v2 = points[2] - points[0]
  normal = np.cross(v1, v2)
  return normal

def rotate_molecules(mol_coordinates_list):
  """Rotates provided molecular coordinates.

  Pseudocode:
  1. Generate random rotation matrix. This matrix applies a
     random transformation to any 3-vector such that, were the
     random transformation repeatedly applied, it would randomly
     sample along the surface of a sphere with radius equal to
     the norm of the given 3-vector cf.
     generate_random_rotation_matrix() for details
  2. Apply R to all atomic coordinates.
  3. Return rotated molecule

  Parameters
  ----------
  mol_coordinates_list: list
    Elements of list must be (N_atoms, 3) shaped arrays
  """
  R = generate_random_rotation_matrix()
  rotated_coordinates_list = []

  for mol_coordinates in mol_coordinates_list:
    coordinates = deepcopy(mol_coordinates)
    rotated_coordinates = np.transpose(np.dot(R, np.transpose(coordinates)))
    rotated_coordinates_list.append(rotated_coordinates)

  return (rotated_coordinates_list)

def compute_all_ecfp(mol, indices=None, degree=2):
  """Obtain molecular fragment for all atoms emanating outward to given degree.

  For each fragment, compute SMILES string (for now) and hash to
  an int. Return a dictionary mapping atom index to hashed
  SMILES.
  """

  ecfp_dict = {}
  from rdkit import Chem
    from rdkit.Chem import rdmolops
    pattern = Chem.MolFromSmarts(
        "[!$(*#*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])("
        "[CH3])[CH3])&!$([CD3](=[N,O,S])-!@[#7,O,S!D1])&!$([#7,O,S!D1]-!@[CD3]="
        "[N,O,S])&!$([CD3](=[N+])-!@[#7!D1])&!$([#7!D1]-!@[CD3]=[N+])]-!@[!$(*#"
        "*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])([CH3])"
        "[CH3])]")
    rdmolops.FastFindRings(self.mol)
    self.rotatable_bonds = self.mol.GetSubstructMatches(pattern)
  for i in range(mol.GetNumAtoms()):
    if indices is not None and i not in indices:
      continue
    env = Chem.FindAtomEnvironmentOfRadiusN(mol, degree, i, useHs=True)
    submol = Chem.PathToSubmol(mol, env)
    smile = Chem.MolToSmiles(submol)
    ecfp_dict[i] = "%s,%s" % (mol.GetAtoms()[i].GetAtomicNum(), smile)

  return ecfp_dict
+63 −0

File changed.

Preview size limit exceeded, changes collapsed.