Unverified Commit 2331a1fe authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #2212 from deepchem/interaction

Interaction Fingerprint Addition
parents ff137c65 35288b8f
Loading
Loading
Loading
Loading

datasets/.gitignore

0 → 100644
+11 −0
Original line number Diff line number Diff line
PPB.csv
SAMPL.csv
bace.csv
bace_c-featurized/
chembl-featurized/
clintox-featurized/
clintox.csv.gz
core_grid.json
ppb-featurized/
sampl-featurized/
atom_init.json
+10 −0
Original line number Diff line number Diff line
@@ -37,6 +37,16 @@ from deepchem.feat.complex_featurizers import RdkitGridFeaturizer
from deepchem.feat.complex_featurizers import NeighborListAtomicCoordinates
from deepchem.feat.complex_featurizers import NeighborListComplexAtomicCoordinates
from deepchem.feat.complex_featurizers import ComplexNeighborListFragmentAtomicCoordinates
from deepchem.feat.complex_featurizers import ContactCircularFingerprint
from deepchem.feat.complex_featurizers import ContactCircularVoxelizer
from deepchem.feat.complex_featurizers import SplifFingerprint
from deepchem.feat.complex_featurizers import SplifVoxelizer
from deepchem.feat.complex_featurizers import ChargeVoxelizer
from deepchem.feat.complex_featurizers import SaltBridgeVoxelizer
from deepchem.feat.complex_featurizers import CationPiVoxelizer
from deepchem.feat.complex_featurizers import PiStackVoxelizer
from deepchem.feat.complex_featurizers import HydrogenBondVoxelizer
from deepchem.feat.complex_featurizers import HydrogenBondCounter

# material featurizers
from deepchem.feat.material_featurizers import ElementPropertyFingerprint
+10 −0
Original line number Diff line number Diff line
@@ -6,3 +6,13 @@ from deepchem.feat.complex_featurizers.rdkit_grid_featurizer import RdkitGridFea
from deepchem.feat.complex_featurizers.complex_atomic_coordinates import NeighborListAtomicCoordinates
from deepchem.feat.complex_featurizers.complex_atomic_coordinates import NeighborListComplexAtomicCoordinates
from deepchem.feat.complex_featurizers.complex_atomic_coordinates import ComplexNeighborListFragmentAtomicCoordinates
from deepchem.feat.complex_featurizers.contact_fingerprints import ContactCircularFingerprint
from deepchem.feat.complex_featurizers.contact_fingerprints import ContactCircularVoxelizer
from deepchem.feat.complex_featurizers.grid_featurizers import ChargeVoxelizer
from deepchem.feat.complex_featurizers.grid_featurizers import SaltBridgeVoxelizer
from deepchem.feat.complex_featurizers.grid_featurizers import CationPiVoxelizer
from deepchem.feat.complex_featurizers.grid_featurizers import PiStackVoxelizer
from deepchem.feat.complex_featurizers.grid_featurizers import HydrogenBondVoxelizer
from deepchem.feat.complex_featurizers.grid_featurizers import HydrogenBondCounter
from deepchem.feat.complex_featurizers.splif_fingerprints import SplifFingerprint
from deepchem.feat.complex_featurizers.splif_fingerprints import SplifVoxelizer
+235 −0
Original line number Diff line number Diff line
"""
Topological fingerprints for macromolecular structures.
"""
import numpy as np
import logging
import itertools
from deepchem.utils.hash_utils import hash_ecfp
from deepchem.feat import ComplexFeaturizer
from deepchem.utils.rdkit_utils import load_complex
from deepchem.utils.hash_utils import vectorize
from deepchem.utils.voxel_utils import voxelize
from deepchem.utils.voxel_utils import convert_atom_to_voxel
from deepchem.utils.rdkit_utils import compute_all_ecfp
from deepchem.utils.rdkit_utils import compute_contact_centroid
from deepchem.utils.rdkit_utils import MoleculeLoadException
from deepchem.utils.geometry_utils import compute_pairwise_distances
from deepchem.utils.geometry_utils import subtract_centroid

from typing import Tuple, Dict, List

logger = logging.getLogger(__name__)


def featurize_contacts_ecfp(
    frag1: Tuple,
    frag2: Tuple,
    pairwise_distances: np.ndarray = None,
    cutoff: float = 4.5,
    ecfp_degree: int = 2) -> Tuple[Dict[int, str], Dict[int, str]]:
  """Build ECFP dictionaries for the atoms of two fragments that are in contact.

  An atom is "in contact" when at least one atom of the other fragment
  lies strictly closer than `cutoff` according to `pairwise_distances`.

  Parameters
  ----------
  frag1: Tuple
    A tuple of (coords, mol) returned by `load_molecule`.
  frag2: Tuple
    A tuple of (coords, mol) returned by `load_molecule`.
  pairwise_distances: np.ndarray, optional
    Array of pairwise fragment-fragment distances (Angstroms). Rows are
    read as `frag1` atoms and columns as `frag2` atoms. When `None`, it
    is computed from the coordinates of the two fragments.
  cutoff: float
    Cutoff distance for contact consideration.
  ecfp_degree: int
    ECFP radius.

  Returns
  -------
  Tuple of two dictionaries of ECFP contact fragments: the first for
  `frag1`, the second for `frag2`.
  """
  if pairwise_distances is None:
    pairwise_distances = compute_pairwise_distances(frag1[0], frag2[0])

  # np.nonzero yields one index array per axis of the boolean mask:
  # entry 0 holds row (frag1) indices, entry 1 column (frag2) indices.
  within_cutoff = np.nonzero(pairwise_distances < cutoff)

  # An atom may take part in several contacts; a set keeps each index
  # once, and int() turns the index values into plain Python ints.
  first_contact_atoms = set(map(int, within_cutoff[0].tolist()))
  second_contact_atoms = set(map(int, within_cutoff[1].tolist()))

  first_ecfp = compute_all_ecfp(
      frag1[1], indices=first_contact_atoms, degree=ecfp_degree)
  second_ecfp = compute_all_ecfp(
      frag2[1], indices=second_contact_atoms, degree=ecfp_degree)

  return (first_ecfp, second_ecfp)


class ContactCircularFingerprint(ComplexFeaturizer):
  """Hashed circular (Morgan/ECFP) fingerprints of the contact region of a complex.

  A macromolecular complex is made up of several constituent
  molecules. For every pair of constituents, the atoms that come
  within `cutoff` of the other molecule form the "contact region".
  Radial "ECFP" fragments centered on those atoms are computed and
  hashed into a fixed-length vector, one vector per molecule of the
  pair.

  For a macromolecular complex, returns a vector of shape
  `(2*size,)`
  """

  def __init__(self, cutoff: float = 4.5, radius: int = 2, size: int = 8):
    """
    Parameters
    ----------
    cutoff: float (default 4.5)
      Distance cutoff in angstroms for molecules in complex.
    radius: int, optional (default 2)
      Fingerprint radius.
    size: int, optional (default 8)
      Length of generated bit vector.
    """
    self.cutoff = cutoff
    self.radius = radius
    self.size = size

  def _featurize(self, mol_pdb: str, protein_pdb: str):
    """
    Compute featurization for a molecular complex

    Parameters
    ----------
    mol_pdb: str
      Filename for ligand molecule
    protein_pdb: str
      Filename for protein molecule

    Returns
    -------
    The concatenation of the hashed contact fingerprints of every
    fragment pair, or `None` if the complex could not be loaded.
    """
    try:
      fragments = load_complex((mol_pdb, protein_pdb), add_hydrogens=False)
    except MoleculeLoadException:
      logger.warning("This molecule cannot be loaded by Rdkit. Returning None")
      return None

    hashed_vectors = []
    # Visit every unordered pair of fragments in the complex.
    for first, second in itertools.combinations(fragments, 2):
      pair_distances = compute_pairwise_distances(first[0], second[0])
      contact_dicts = featurize_contacts_ecfp(
          first,
          second,
          pair_distances,
          cutoff=self.cutoff,
          ecfp_degree=self.radius)
      # One hashed vector per fragment of the pair, in (first, second) order.
      hashed_vectors.extend([
          vectorize(hash_ecfp, feature_dict=contact_dict, size=self.size)
          for contact_dict in contact_dicts
      ])

    return np.concatenate(hashed_vectors)


class ContactCircularVoxelizer(ComplexFeaturizer):
  """Computes ECFP fingerprints on a voxel grid.

  A macromolecular complex is made up of several constituent
  molecules. For every pair of constituents, the atoms that come
  within `cutoff` of the other molecule form the "contact region".
  Radial "ECFP" fragments centered on those atoms are computed, and
  each ECFP fingerprint is localized at the voxel in which it
  originated.

  Featurizes a macromolecular complex into a tensor of shape
  `(voxels_per_edge, voxels_per_edge, voxels_per_edge, size)` where
  `voxels_per_edge = int(box_width/voxel_width)`. If `flatten==True`,
  then returns a flattened version of this tensor of length
  `size*voxels_per_edge**3`
  """

  def __init__(self,
               cutoff: float = 4.5,
               radius: int = 2,
               size: int = 8,
               box_width: float = 16.0,
               voxel_width: float = 1.0,
               flatten: bool = False):
    """
    Parameters
    ----------
    cutoff: float (default 4.5)
      Distance cutoff in angstroms for molecules in complex.
    radius : int, optional (default 2)
      Fingerprint radius.
    size : int, optional (default 8)
      Length of generated bit vector.
    box_width: float, optional (default 16.0)
      Size of a box in which voxel features are calculated. Box
      is centered on a ligand centroid.
    voxel_width: float, optional (default 1.0)
      Size of a 3D voxel in a grid.
    flatten: bool, optional (default False)
      If True, then returns a flat feature vector rather than voxel grid. This
      feature vector is constructed by flattening the usual voxel grid.
    """
    self.cutoff = cutoff
    self.radius = radius
    self.size = size
    self.flatten = flatten
    self.box_width = box_width
    self.voxel_width = voxel_width
    # Number of voxels along each edge of the cubic box.
    self.voxels_per_edge = int(self.box_width / self.voxel_width)

  def _featurize(self, mol_pdb: str, protein_pdb: str):
    """
    Compute featurization for a molecular complex

    Parameters
    ----------
    mol_pdb: str
      Filename for ligand molecule
    protein_pdb: str
      Filename for protein molecule

    Returns
    -------
    The voxelized contact fingerprints of every fragment pair, joined
    along the last axis (or flattened and joined end to end when
    `flatten` is set), or `None` if the complex could not be loaded.
    """
    try:
      fragments = load_complex((mol_pdb, protein_pdb), add_hydrogens=False)
    except MoleculeLoadException:
      logger.warning("This molecule cannot be loaded by Rdkit. Returning None")
      return None

    pairwise_features: List[np.ndarray] = []
    centroid = compute_contact_centroid(fragments, cutoff=self.cutoff)
    # Visit every unordered pair of fragments in the complex.
    for first, second in itertools.combinations(fragments, 2):
      # Distances are taken before the coordinates are re-centered.
      # NOTE(review): confirm whether subtract_centroid modifies its
      # input in place; the statement order here is kept deliberately.
      pair_distances = compute_pairwise_distances(first[0], second[0])
      centered_coords = [
          subtract_centroid(first[0], centroid),
          subtract_centroid(second[0], centroid)
      ]
      contact_dicts = featurize_contacts_ecfp(
          first,
          second,
          pair_distances,
          cutoff=self.cutoff,
          ecfp_degree=self.radius)
      # One grid per fragment of the pair; the grids are then added up.
      grids = [
          voxelize(
              convert_atom_to_voxel,
              coords,
              self.box_width,
              self.voxel_width,
              hash_function=hash_ecfp,
              feature_dict=contact_dict,
              nb_channel=self.size)
          for coords, contact_dict in zip(centered_coords, contact_dicts)
      ]
      pairwise_features.append(sum(grids))

    if not self.flatten:
      # Features are of shape (voxels_per_edge, voxels_per_edge,
      # voxels_per_edge, num_feat) so we should concatenate on the last
      # axis.
      return np.concatenate(pairwise_features, axis=-1)
    return np.concatenate([grid.flatten() for grid in pairwise_features])
+633 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading