Commit a3609a84 authored by nd-02110114's avatar nd-02110114
Browse files

♻️ (refactor docs)

parent 0fa97cb9
Loading
Loading
Loading
Loading
+107 −57
Original line number Diff line number Diff line
@@ -2,11 +2,9 @@
Conformer generation.
"""

__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "3-clause BSD"

import numpy as np
from typing import Any, List, Optional
from deepchem.utils.typing import RDKitMol


class ConformerGenerator(object):
@@ -28,6 +26,17 @@ class ConformerGenerator(object):
  .. [1] http://rdkit.org/docs/GettingStartedInPython.html#working-with-3d-molecules
  .. [2] http://pubs.acs.org/doi/full/10.1021/ci2004658
  
  Note
  ----
  This class requires RDKit to be installed.
  """

  def __init__(self,
               max_conformers: int = 1,
               rmsd_threshold: float = 0.5,
               force_field: str = 'uff',
               pool_multiplier: int = 10):
    """
    Parameters
    ----------
    max_conformers : int, optional (default 1)
@@ -44,12 +53,6 @@ class ConformerGenerator(object):
      minimization, increasing the size of the pool increases the chance
      of identifying max_conformers unique conformers.
    """

  def __init__(self,
               max_conformers=1,
               rmsd_threshold=0.5,
               force_field='uff',
               pool_multiplier=10):
    self.max_conformers = max_conformers
    if rmsd_threshold is None or rmsd_threshold < 0:
      rmsd_threshold = -1.
@@ -57,18 +60,24 @@ class ConformerGenerator(object):
    self.force_field = force_field
    self.pool_multiplier = pool_multiplier

  def __call__(self, mol):
  def __call__(self, mol: RDKitMol) -> RDKitMol:
    """
    Generate conformers for a molecule.

    Parameters
    ----------
    mol: RDKit Mol
        Molecule.
      RDKit Mol object

    Returns
    -------
    mol: RDKit Mol
      A new RDKit Mol containing the chosen conformers, sorted by
      increasing energy.
    """
    return self.generate_conformers(mol)

  def generate_conformers(self, mol):
  def generate_conformers(self, mol: RDKitMol) -> RDKitMol:
    """
    Generate conformers for a molecule.

@@ -78,7 +87,13 @@ class ConformerGenerator(object):
    Parameters
    ----------
    mol: RDKit Mol
        Molecule.
      RDKit Mol object

    Returns
    -------
    mol: RDKit Mol
      A new RDKit Mol containing the chosen conformers, sorted by
      increasing energy.
    """

    # initial embedding
@@ -98,36 +113,57 @@ class ConformerGenerator(object):

    return mol

  def embed_molecule(self, mol):
  def embed_molecule(self, mol: RDKitMol) -> RDKitMol:
    """
    Generate conformers, possibly with pruning.

    Parameters
    ----------
    mol: RDKit Mol
        Molecule.
      RDKit Mol object

    Returns
    -------
    mol: RDKit Mol
      RDKit Mol object with embedded multiple conformers.
    """
    try:
      from rdkit import Chem
      from rdkit.Chem import AllChem
    except ModuleNotFoundError:
      raise ValueError("This function requires RDKit to be installed.")

    mol = Chem.AddHs(mol)  # add hydrogens
    n_confs = self.max_conformers * self.pool_multiplier
    AllChem.EmbedMultipleConfs(mol, numConfs=n_confs, pruneRmsThresh=-1.)
    return mol

  def get_molecule_force_field(self, mol, conf_id=None, **kwargs):
  def get_molecule_force_field(self,
                               mol: RDKitMol,
                               conf_id: Optional[int] = None,
                               **kwargs) -> Any:
    """
    Get a force field for a molecule.

    Parameters
    ----------
    mol: RDKit Mol
        Molecule.
      RDKit Mol object with embedded conformers.
    conf_id : int, optional
      ID of the conformer to associate with the force field.
    kwargs : dict, optional
      Keyword arguments for force field constructor.

    Returns
    -------
    ff: rdkit.ForceField
      RDKit force field instance for a molecule.
    """
    try:
      from rdkit.Chem import AllChem
    except ModuleNotFoundError:
      raise ValueError("This function requires RDKit to be installed.")

    if self.force_field == 'uff':
      ff = AllChem.UFFGetMoleculeForceField(mol, confId=conf_id, **kwargs)
    elif self.force_field.startswith('mmff'):
@@ -141,31 +177,31 @@ class ConformerGenerator(object):
                       "'{}'.".format(self.force_field))
    return ff

  def minimize_conformers(self, mol):
  def minimize_conformers(self, mol: RDKitMol) -> None:
    """
    Minimize molecule conformers.

    Parameters
    ----------
    mol: RDKit Mol
        Molecule.
      RDKit Mol object with embedded conformers.
    """
    for conf in mol.GetConformers():
      ff = self.get_molecule_force_field(mol, conf_id=conf.GetId())
      ff.Minimize()

  def get_conformer_energies(self, mol):
  def get_conformer_energies(self, mol: RDKitMol) -> np.ndarray:
    """
    Calculate conformer energies.

    Parameters
    ----------
    mol: RDKit Mol
        Molecule.
      RDKit Mol object with embedded conformers.

    Returns
    -------
    energies : array_like
    energies : np.ndarray
      Minimized conformer energies.
    """
    energies = []
@@ -176,7 +212,7 @@ class ConformerGenerator(object):
    energies = np.asarray(energies, dtype=float)
    return energies

  def prune_conformers(self, mol):
  def prune_conformers(self, mol: RDKitMol) -> RDKitMol:
    """
    Prune conformers from a molecule using an RMSD threshold, starting
    with the lowest energy conformer.
@@ -184,20 +220,26 @@ class ConformerGenerator(object):
    Parameters
    ----------
    mol: RDKit Mol
        Molecule.
      RDKit Mol object

    Returns
    -------
    new_mol: RDKit Mol
      A new RDKit Mol containing the chosen conformers, sorted by
      increasing energy.
    """
    try:
      from rdkit import Chem
    except ModuleNotFoundError:
      raise ValueError("This function requires RDKit to be installed.")

    if self.rmsd_threshold < 0 or mol.GetNumConformers() <= 1:
      return mol
    energies = self.get_conformer_energies(mol)
    rmsd = self.get_conformer_rmsd(mol)

    sort = np.argsort(energies)  # sort by increasing energy
    keep = []  # always keep lowest-energy conformer
    keep: List[float] = []  # always keep lowest-energy conformer
    discard = []
    for i in sort:
      # always keep lowest-energy conformer
@@ -221,26 +263,34 @@ class ConformerGenerator(object):

    # create a new molecule to hold the chosen conformers
    # this ensures proper conformer IDs and energy-based ordering
    from rdkit import Chem
    new = Chem.Mol(mol)
    new.RemoveAllConformers()
    new_mol = Chem.Mol(mol)
    new_mol.RemoveAllConformers()
    conf_ids = [conf.GetId() for conf in mol.GetConformers()]
    for i in keep:
      conf = mol.GetConformer(conf_ids[i])
      new.AddConformer(conf, assignId=True)
    return new
      new_mol.AddConformer(conf, assignId=True)
    return new_mol

  @staticmethod
  def get_conformer_rmsd(mol):
  def get_conformer_rmsd(mol: RDKitMol) -> np.ndarray:
    """
    Calculate conformer-conformer RMSD.

    Parameters
    ----------
    mol: RDKit Mol
        Molecule.
      RDKit Mol object

    Returns
    -------
    rmsd: np.ndarray
      A conformer-conformer RMSD value. The shape is `(NumConformers, NumConformers)`
    """
    try:
      from rdkit.Chem import AllChem
    except ModuleNotFoundError:
      raise ValueError("This function requires RDKit to be installed.")

    rmsd = np.zeros(
        (mol.GetNumConformers(), mol.GetNumConformers()), dtype=float)
    for i, ref_conf in enumerate(mol.GetConformers()):
+143 −118
Original line number Diff line number Diff line
"""A collection of utilities for dealing with Molecular Fragments"""
import itertools
import numpy as np
from typing import List, Optional, Any
from typing import Any, List, Iterable, Optional, Sequence, Set, Tuple, Union

from deepchem.utils.typing import RDKitAtom, RDKitMol
from deepchem.utils.geometry_utils import compute_pairwise_distances
from deepchem.utils.rdkit_utils import compute_charges


def get_partial_charge(atom):
  """Get partial charge of a given atom (rdkit Atom object)
class AtomShim(object):
  """This is a shim object wrapping an atom.

  We use this class instead of raw RDKit atoms since manipulating a
  large number of rdkit Atoms seems to result in segfaults. Wrapping
  the basic information in an AtomShim seems to avoid issues.
  """

  def __init__(self, atomic_num: int, partial_charge: float,
               atom_coords: np.ndarray):
    """Initialize this object

    Parameters
    ----------
  atom: rdkit atom or `AtomShim` object
    Either an rdkit atom or `AtomShim`
    atomic_num: int
      Atomic number for this atom.
    partial_charge: float
      The partial Gasteiger charge for this atom
    atom_coords: np.ndarray
      Of shape (3,) with the coordinates of this atom
    """
    self.atomic_num = atomic_num
    self.partial_charge = partial_charge
    self.coords = atom_coords

  Note
  ----
  This function requires RDKit to be installed.
  def GetAtomicNum(self) -> int:
    """Returns atomic number for this atom.

  Examples
  --------
  >>> from rdkit import Chem
  >>> mol = Chem.MolFromSmiles("CC")
  >>> atom = mol.GetAtoms()[0]
  >>> get_partial_charge(atom)
  0
    Returns
    -------
    int
      Atomic number for this atom.
    """
  from rdkit import Chem
  if isinstance(atom, Chem.Atom):
    try:
      value = atom.GetProp(str("_GasteigerCharge"))
      if value == '-nan':
        return 0
      return float(value)
    except KeyError:
      return 0
  else:
    return atom.GetPartialCharge()
    return self.atomic_num

  def GetPartialCharge(self) -> float:
    """Returns partial charge for this atom.

    Returns
    -------
    float
      A partial Gasteiger charge for this atom.
    """
    return self.partial_charge

  def GetCoords(self) -> np.ndarray:
    """Returns 3D coordinates for this atom as numpy array.

    Returns
    -------
    np.ndarray
      Numpy array of shape `(3,)` with coordinates for this atom.
    """
    return self.coords


class MolecularFragment(object):
@@ -60,13 +85,13 @@ class MolecularFragment(object):
  >>> fragment = MolecularFragment([atom], coords)
  """

  def __init__(self, atoms, coords):
  def __init__(self, atoms: Sequence[RDKitAtom], coords: np.ndarray):
    """Initialize this object.

    Parameters
    ----------
    atoms: list
      Each entry in this list should be an RdkitAtom
    atoms: Iterable[RDKit Atom]
      Each entry in this list should be a RDKit Atom.
    coords: np.ndarray
      Array of locations for atoms of shape `(N, 3)` where `N ==
      len(atoms)`.
@@ -83,77 +108,68 @@ class MolecularFragment(object):
    ]
    self.coords = coords

  def GetAtoms(self):
  def GetAtoms(self) -> List[AtomShim]:
    """Returns the list of atoms

    Returns
    -------
    List[AtomShim]
      list of atoms in this fragment.
    """
    return self.atoms

  def GetCoords(self):
  def GetCoords(self) -> np.ndarray:
    """Returns 3D coordinates for this fragment as numpy array.

    Returns
    -------
    Numpy array of shape `(N, 3)` with coordinates for this fragment.
    Here `N == len(self.GetAtoms())`.
    np.ndarray
      A numpy array of shape `(N, 3)` with coordinates for this fragment.
      Here, N is the number of atoms.
    """
    return self.coords


class AtomShim(object):
  """This is a shim object wrapping an atom.

  We use this class instead of raw RDKit atoms since manipulating a
  large number of rdkit Atoms seems to result in segfaults. Wrapping
  the basic information in an AtomShim seems to avoid issues.
  """

  def __init__(self, atomic_num: int, partial_charge: float,
               atom_coords: np.ndarray):
    """Initialize this object
def get_partial_charge(atom: Union[RDKitAtom, AtomShim]) -> float:
  """Get partial charge of a given atom (rdkit Atom object)

  Parameters
  ----------
    atomic_num: int
      Atomic number for this atom.
    partial_charge: float
      The partial Gasteiger charge for this atom
    atom_coords: np.ndarray
      Of shape (3,) with the coordinates of this atom
    """
    self.atomic_num = atomic_num
    self.partial_charge = partial_charge
    self.coords = atom_coords

  def GetAtomicNum(self) -> int:
    """Returns atomic number for this atom.
  atom: RDKit Atom or AtomShim
    Either a rdkit.Atom object or `AtomShim`

  Returns
  -------
    Atomic number for this atom.
    """
    return self.atomic_num
  float
    A partial Gasteiger charge of a given atom.

  def GetPartialCharge(self) -> float:
    """Returns partial charge for this atom.
  Note
  ----
  This function requires RDKit to be installed.

    Returns
    -------
    Partial Gasteiger charge for this atom.
  Examples
  --------
  >>> from rdkit import Chem
  >>> mol = Chem.MolFromSmiles("CC")
  >>> atom = mol.GetAtoms()[0]
  >>> get_partial_charge(atom)
  0.0
  """
    return self.partial_charge

  def GetCoords(self) -> np.ndarray:
    """Returns 3D coordinates for this atom as numpy array.
  try:
    from rdkit import Chem
  except ModuleNotFoundError:
    raise ValueError("This function requires RDKit to be installed.")

    Returns
    -------
    Numpy array of shape `(3,)` with coordinates for this atom.
    """
    return self.coords
  if isinstance(atom, Chem.Atom):
    try:
      value = atom.GetProp(str("_GasteigerCharge"))
      if value == '-nan':
        return 0.0
      return float(value)
    except KeyError:
      return 0.0
  else:
    return atom.GetPartialCharge()


def merge_molecular_fragments(
@@ -162,11 +178,12 @@ def merge_molecular_fragments(

  Parameters
  ----------
  molecules: list
  molecules: List[MolecularFragment]
    List of `MolecularFragment` objects. 

  Returns
  -------
  Optional[MolecularFragment]
    Returns a merged `MolecularFragment`
  """
  if len(molecules) == 0:
@@ -183,15 +200,16 @@ def merge_molecular_fragments(
    return MolecularFragment(all_atoms, all_coords)


def get_mol_subset(coords: np.ndarray, mol,
                   atom_indices_to_keep: List[int]) -> MolecularFragment:
def get_mol_subset(
    coords: np.ndarray, mol: Union[RDKitMol, MolecularFragment],
    atom_indices_to_keep: List[int]) -> Tuple[np.ndarray, MolecularFragment]:
  """Strip a subset of the atoms in this molecule

  Parameters
  ----------
  coords: Numpy ndarray
  coords: np.ndarray
    Must be of shape (N, 3) and correspond to coordinates of mol.
  mol: Rdkit mol or `MolecularFragment`
  mol: RDKit Mol or MolecularFragment
    The molecule to strip
  atom_indices_to_keep: list
    List of the indices of the atoms to keep. Each index is a unique
@@ -199,7 +217,9 @@ def get_mol_subset(coords: np.ndarray, mol,

  Returns
  -------
  Returns a `MolecularFragment` that summarizes the subset to be returned.
  Tuple[np.ndarray, MolecularFragment]
    A tuple of `(coords, mol_frag)` where `coords` is a numpy array of
    coordinates with hydrogen coordinates. `mol_frag` is a `MolecularFragment`.

  Note
  ----
@@ -209,9 +229,10 @@ def get_mol_subset(coords: np.ndarray, mol,
    from rdkit import Chem
  except ModuleNotFoundError:
    raise ValueError("This function requires RDKit to be installed.")

  indexes_to_keep = []
  atoms_to_keep = []
  # Compute partial charges on molecule if rdkit
  # Compute partial charges on molecule if RDKit Mol
  if isinstance(mol, Chem.Mol):
    compute_charges(mol)
  atoms = list(mol.GetAtoms())
@@ -220,24 +241,25 @@ def get_mol_subset(coords: np.ndarray, mol,
    atoms_to_keep.append(atoms[index])
  coords = coords[indexes_to_keep]
  mol_frag = MolecularFragment(atoms_to_keep, coords)
  return mol_frag
  return coords, mol_frag


def strip_hydrogens(coords: np.ndarray, mol) -> MolecularFragment:
def strip_hydrogens(coords: np.ndarray, mol: Union[RDKitMol, MolecularFragment]
                   ) -> Tuple[np.ndarray, MolecularFragment]:
  """Strip the hydrogens from input molecule

  Parameters
  ----------
  coords: Numpy ndarray
    Must be of shape (N, 3) and correspond to coordinates of mol.
  mol: Rdkit mol or `MolecularFragment`
  coords: np.ndarray
    The coords must be of shape (N, 3) and correspond to coordinates of mol.
  mol: RDKit Mol or MolecularFragment
    The molecule to strip

  Returns
  -------
  A tuple of (coords, mol_frag) where coords is a Numpy array of
  coordinates with hydrogen coordinates. mol_frag is a
  `MolecularFragment`. 
  Tuple[np.ndarray, MolecularFragment]
    A tuple of `(coords, mol_frag)` where `coords` is a numpy array of
    coordinates with hydrogen coordinates. `mol_frag` is a `MolecularFragment`.

  Note
  ----
@@ -252,8 +274,8 @@ def strip_hydrogens(coords: np.ndarray, mol) -> MolecularFragment:
  return get_mol_subset(coords, mol, atom_indices_to_keep)


def get_contact_atom_indices(fragments: List[Any],
                             cutoff: float = 4.5) -> List[Any]:
def get_contact_atom_indices(fragments: List[Tuple[np.ndarray, RDKitMol]],
                             cutoff: float = 4.5) -> List[List[int]]:
  """Compute that atoms close to contact region.

  Molecular complexes can get very large. This can make it unwieldy to
@@ -266,21 +288,22 @@ def get_contact_atom_indices(fragments: List[Any],

  Parameters
  ----------
  fragments: List
  fragments: List[Tuple[np.ndarray, RDKit Mol]]
    As returned by `rdkit_utils.load_complex`, a list of tuples of
    `(coords, mol)` where `coords` is a `(N_atoms, 3)` array and `mol`
    is the rdkit molecule object.
  cutoff: float
  cutoff: float, optional (default 4.5)
    The cutoff distance in angstroms.

  Returns
  -------
  List[List[int]]
    A list of length `len(molecular_complex)`. Each entry in this list
    is a list of atom indices from that molecule which should be kept, in
    sorted order.
  """
  # indices to atoms to keep
  keep_inds: List[Any] = [set([]) for _ in fragments]
  keep_inds: List[Set[int]] = [set([]) for _ in fragments]
  for (ind1, ind2) in itertools.combinations(range(len(fragments)), 2):
    frag1, frag2 = fragments[ind1], fragments[ind2]
    pairwise_distances = compute_pairwise_distances(frag1[0], frag2[0])
@@ -293,12 +316,13 @@ def get_contact_atom_indices(fragments: List[Any],
    frag2_atoms = set([int(c) for c in contacts[1].tolist()])
    keep_inds[ind1] = keep_inds[ind1].union(frag1_atoms)
    keep_inds[ind2] = keep_inds[ind2].union(frag2_atoms)
  keep_inds = [sorted(list(keep)) for keep in keep_inds]
  return keep_inds
  sorted_keep_inds = [sorted(list(keep)) for keep in keep_inds]
  return sorted_keep_inds


def reduce_molecular_complex_to_contacts(fragments: List,
                                         cutoff: float = 4.5) -> List:
def reduce_molecular_complex_to_contacts(
    fragments: List[Tuple[np.ndarray, RDKitMol]],
    cutoff: float = 4.5) -> List[Tuple[np.ndarray, MolecularFragment]]:
  """Reduce a molecular complex to only those atoms near a contact.

  Molecular complexes can get very large. This can make it unwieldy to
@@ -311,7 +335,7 @@ def reduce_molecular_complex_to_contacts(fragments: List,

  Parameters
  ----------
  fragments: List
  fragments: List[Tuple[np.ndarray, RDKitMol]]
    As returned by `rdkit_utils.load_complex`, a list of tuples of
    `(coords, mol)` where `coords` is a `(N_atoms, 3)` array and `mol`
    is the rdkit molecule object.
@@ -320,6 +344,7 @@ def reduce_molecular_complex_to_contacts(fragments: List,

  Returns
  -------
  List[Tuple[np.ndarray, MolecularFragment]]
    A list of length `len(molecular_complex)`. Each entry in this list
    is a tuple of `(coords, MolecularFragment)`. The coords is stripped
    down to `(N_contact_atoms, 3)` where `N_contact_atoms` is the number
+1 −0
Original line number Diff line number Diff line
@@ -18,3 +18,4 @@ Shape = Tuple[int, ...]

# type of RDKit Mol object
RDKitMol = Any
RDKitAtom = Any
+1 −1

File changed.

Contains only whitespace changes.