Commit 5bd91993 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Remove more files

parent 94885587
Loading
Loading
Loading
Loading
+0 −168
Original line number Diff line number Diff line
from collections import deque

import sys
import tensorflow as tf
import pickle

import os
import fnmatch
import numpy as np
from scipy.spatial.distance import pdist, squareform
import pandas as pd

from deepchem.feat.base_classes import Featurizer
from deepchem.feat.graph_features import atom_features
from scipy.sparse import csr_matrix


def get_atom_type(atom):
  elem = atom.GetAtomicNum()
  hyb = str(atom.GetHybridization).lower()
  if elem == 1:
    return (0)
  if elem == 4:
    return (1)
  if elem == 5:
    return (2)
  if elem == 6:
    if "sp2" in hyb:
      return (3)
    elif "sp3" in hyb:
      return (4)
    else:
      return (5)
  if elem == 7:
    if "sp2" in hyb:
      return (6)
    elif "sp3" in hyb:
      return (7)
    else:
      return (8)
  if elem == 8:
    if "sp2" in hyb:
      return (9)
    elif "sp3" in hyb:
      return (10)
    else:
      return (11)
  if elem == 9:
    return (12)
  if elem == 15:
    if "sp2" in hyb:
      return (13)
    elif "sp3" in hyb:
      return (14)
    else:
      return (15)
  if elem == 16:
    if "sp2" in hyb:
      return (16)
    elif "sp3" in hyb:
      return (17)
    else:
      return (18)
  if elem == 17:
    return (19)
  if elem == 35:
    return (20)
  if elem == 53:
    return (21)
  return (22)


def get_atom_adj_matrices(mol,
                          n_atom_types,
                          max_n_atoms=200,
                          max_valence=4,
                          graph_conv_features=True,
                          nxn=True):
  if not graph_conv_features:
    bond_matrix = np.zeros((max_n_atoms, 4 * max_valence)).astype(np.uint8)

  if nxn:
    adj_matrix = np.zeros((max_n_atoms, max_n_atoms)).astype(np.uint8)
  else:
    adj_matrix = np.zeros((max_n_atoms, max_valence)).astype(np.uint8)
    adj_matrix += (adj_matrix.shape[0] - 1)

  if not graph_conv_features:
    atom_matrix = np.zeros((max_n_atoms, n_atom_types + 3)).astype(np.uint8)
    atom_matrix[:, atom_matrix.shape[1] - 1] = 1

  atom_arrays = []
  for a_idx in range(0, mol.GetNumAtoms()):
    atom = mol.GetAtomWithIdx(a_idx)
    if graph_conv_features:
      atom_arrays.append(atom_features(atom))
    else:

      atom_type = get_atom_type(atom)
      atom_matrix[a_idx][-1] = 0
      atom_matrix[a_idx][atom_type] = 1

    for n_idx, neighbor in enumerate(atom.GetNeighbors()):
      if nxn:
        adj_matrix[a_idx][neighbor.GetIdx()] = 1
        adj_matrix[a_idx][a_idx] = 1
      else:
        adj_matrix[a_idx][n_idx] = neighbor.GetIdx()

      if not graph_conv_features:
        bond = mol.GetBondBetweenAtoms(a_idx, neighbor.GetIdx())
        bond_type = str(bond.GetBondType()).lower()
        if "single" in bond_type:
          bond_order = 0
        elif "double" in bond_type:
          bond_order = 1
        elif "triple" in bond_type:
          bond_order = 2
        elif "aromatic" in bond_type:
          bond_order = 3
        bond_matrix[a_idx][(4 * n_idx) + bond_order] = 1

  if graph_conv_features:
    n_feat = len(atom_arrays[0])
    atom_matrix = np.zeros((max_n_atoms, n_feat)).astype(np.uint8)
    for idx, atom_array in enumerate(atom_arrays):
      atom_matrix[idx, :] = atom_array
  else:
    atom_matrix = np.concatenate(
        [atom_matrix, bond_matrix], axis=1).astype(np.uint8)

  return (adj_matrix.astype(np.uint8), atom_matrix.astype(np.uint8))


def featurize_mol(mol, n_atom_types, max_n_atoms, max_valence,
                  num_atoms_feature):
  adj_matrix, atom_matrix = get_atom_adj_matrices(mol, n_atom_types,
                                                  max_n_atoms, max_valence)
  if num_atoms_feature:
    return ((adj_matrix, atom_matrix, mol.GetNumAtoms()))
  return ((adj_matrix, atom_matrix))


class AdjacencyFingerprint(Featurizer):

  def __init__(self,
               n_atom_types=23,
               max_n_atoms=200,
               add_hydrogens=False,
               max_valence=4,
               num_atoms_feature=False):
    self.n_atom_types = n_atom_types
    self.max_n_atoms = max_n_atoms
    self.add_hydrogens = add_hydrogens
    self.max_valence = max_valence
    self.num_atoms_feature = num_atoms_feature

  def featurize(self, rdkit_mols):
    featurized_mols = np.empty((len(rdkit_mols)), dtype=object)

    from rdkit import Chem
    for idx, mol in enumerate(rdkit_mols):
      if self.add_hydrogens:
        mol = Chem.AddHs(mol)
      featurized_mol = featurize_mol(mol, self.n_atom_types, self.max_n_atoms,
                                     self.max_valence, self.num_atoms_feature)
      featurized_mols[idx] = featurized_mol
    return (featurized_mols)
+0 −305
Original line number Diff line number Diff line
"""
Atomic coordinate featurizer.
"""
__author__ = "Joseph Gomes and Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import logging
import numpy as np
from deepchem.utils.save import log
from deepchem.feat import Featurizer
from deepchem.feat import ComplexFeaturizer
from deepchem.utils import rdkit_util, pad_array
from deepchem.utils.rdkit_util import MoleculeLoadException


class AtomicCoordinates(Featurizer):
  """
  Nx3 matrix of Cartesian coordinates [Angstrom]
  """
  name = ['atomic_coordinates']

  def _featurize(self, mol):
    """
    Calculate atomic coodinates.

    Parameters
    ----------
    mol : RDKit Mol
          Molecule.
    """

    N = mol.GetNumAtoms()
    coords = np.zeros((N, 3))

    # RDKit stores atomic coordinates in Angstrom. Atomic unit of length is the
    # bohr (1 bohr = 0.529177 Angstrom). Converting units makes gradient calculation
    # consistent with most QM software packages.
    coords_in_bohr = [
        mol.GetConformer(0).GetAtomPosition(i).__idiv__(0.52917721092)
        for i in range(N)
    ]

    for atom in range(N):
      coords[atom, 0] = coords_in_bohr[atom].x
      coords[atom, 1] = coords_in_bohr[atom].y
      coords[atom, 2] = coords_in_bohr[atom].z

    coords = [coords]
    return coords


def compute_neighbor_list(coords, neighbor_cutoff, max_num_neighbors,
                          periodic_box_size):
  """Computes a neighbor list from atom coordinates."""
  N = coords.shape[0]
  import mdtraj
  traj = mdtraj.Trajectory(coords.reshape((1, N, 3)), None)
  box_size = None
  if periodic_box_size is not None:
    box_size = np.array(periodic_box_size)
    traj.unitcell_vectors = np.array(
        [[[box_size[0], 0, 0], [0, box_size[1], 0], [0, 0, box_size[2]]]],
        dtype=np.float32)
  neighbors = mdtraj.geometry.compute_neighborlist(traj, neighbor_cutoff)
  neighbor_list = {}
  for i in range(N):
    if max_num_neighbors is not None and len(neighbors[i]) > max_num_neighbors:
      delta = coords[i] - coords.take(neighbors[i], axis=0)
      if box_size is not None:
        delta -= np.round(delta / box_size) * box_size
      dist = np.linalg.norm(delta, axis=1)
      sorted_neighbors = list(zip(dist, neighbors[i]))
      sorted_neighbors.sort()
      neighbor_list[i] = [
          sorted_neighbors[j][1] for j in range(max_num_neighbors)
      ]
    else:
      neighbor_list[i] = list(neighbors[i])
  return neighbor_list


def get_coords(mol):
  """
  Gets coordinates in Angstrom for RDKit mol.
  """
  N = mol.GetNumAtoms()
  coords = np.zeros((N, 3))

  coords_raw = [mol.GetConformer(0).GetAtomPosition(i) for i in range(N)]
  for atom in range(N):
    coords[atom, 0] = coords_raw[atom].x
    coords[atom, 1] = coords_raw[atom].y
    coords[atom, 2] = coords_raw[atom].z
  return coords


class NeighborListAtomicCoordinates(Featurizer):
  """
  Adjacency List of neighbors in 3-space

  Neighbors determined by user-defined distance cutoff [in Angstrom].

  https://en.wikipedia.org/wiki/Cell_list
  Ref: http://www.cs.cornell.edu/ron/references/1989/Calculations%20of%20a%20List%20of%20Neighbors%20in%20Molecular%20Dynamics%20Si.pdf

  Parameters
  ----------
  neighbor_cutoff: float
    Threshold distance [Angstroms] for counting neighbors.
  periodic_box_size: 3 element array
    Dimensions of the periodic box in Angstroms, or None to not use periodic boundary conditions
  """

  def __init__(self,
               max_num_neighbors=None,
               neighbor_cutoff=4,
               periodic_box_size=None):
    if neighbor_cutoff <= 0:
      raise ValueError("neighbor_cutoff must be positive value.")
    if max_num_neighbors is not None:
      if not isinstance(max_num_neighbors, int) or max_num_neighbors <= 0:
        raise ValueError("max_num_neighbors must be positive integer.")
    self.max_num_neighbors = max_num_neighbors
    self.neighbor_cutoff = neighbor_cutoff
    self.periodic_box_size = periodic_box_size
    # Type of data created by this featurizer
    self.dtype = object
    self.coordinates_featurizer = AtomicCoordinates()

  def _featurize(self, mol):
    """
    Compute neighbor list.

    Parameters
    ----------
      mol: rdkit Mol
        To be featurized.
    """
    N = mol.GetNumAtoms()
    # TODO(rbharath): Should this return a list?
    bohr_coords = self.coordinates_featurizer._featurize(mol)[0]
    coords = get_coords(mol)
    neighbor_list = compute_neighbor_list(coords, self.neighbor_cutoff,
                                          self.max_num_neighbors,
                                          self.periodic_box_size)
    return (bohr_coords, neighbor_list)


class NeighborListComplexAtomicCoordinates(ComplexFeaturizer):
  """
  Adjacency list of neighbors for protein-ligand complexes in 3-space.

  Neighbors dtermined by user-dfined distance cutoff.
  """

  def __init__(self, max_num_neighbors=None, neighbor_cutoff=4):
    if neighbor_cutoff <= 0:
      raise ValueError("neighbor_cutoff must be positive value.")
    if max_num_neighbors is not None:
      if not isinstance(max_num_neighbors, int) or max_num_neighbors <= 0:
        raise ValueError("max_num_neighbors must be positive integer.")
    self.max_num_neighbors = max_num_neighbors
    self.neighbor_cutoff = neighbor_cutoff
    # Type of data created by this featurizer
    self.dtype = object
    self.coordinates_featurizer = AtomicCoordinates()

  def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
    """
    Compute neighbor list for complex.

    Parameters
    ----------
    mol_pdb_file: Str 
      Filename for ligand pdb file. 
    protein_pdb_file: Str 
      Filename for protein pdb file. 
    """
    mol_coords, ob_mol = rdkit_util.load_molecule(mol_pdb_file)
    protein_coords, protein_mol = rdkit_util.load_molecule(protein_pdb_file)
    system_coords = rdkit_util.merge_molecules_xyz(mol_coords, protein_coords)

    system_neighbor_list = compute_neighbor_list(
        system_coords, self.neighbor_cutoff, self.max_num_neighbors, None)

    return (system_coords, system_neighbor_list)


class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
  """This class computes the featurization that corresponds to AtomicConvModel.

  This class computes featurizations needed for AtomicConvModel. Given a
  two molecular structures, it computes a number of useful geometric
  features. In particular, for each molecule and the global complex, it
  computes a coordinates matrix of size (N_atoms, 3) where N_atoms is the
  number of atoms. It also computes a neighbor-list, a dictionary with
  N_atoms elements where neighbor-list[i] is a list of the atoms the i-th
  atom has as neighbors. In addition, it computes a z-matrix for the
  molecule which is an array of shape (N_atoms,) that contains the atomic
  number of that atom.

  Since the featurization computes these three quantities for each of the
  two molecules and the complex, a total of 9 quantities are returned for
  each complex. Note that for efficiency, fragments of the molecules can be
  provided rather than the full molecules themselves.
  """

  def __init__(self,
               frag1_num_atoms,
               frag2_num_atoms,
               complex_num_atoms,
               max_num_neighbors,
               neighbor_cutoff,
               strip_hydrogens=True):
    self.frag1_num_atoms = frag1_num_atoms
    self.frag2_num_atoms = frag2_num_atoms
    self.complex_num_atoms = complex_num_atoms
    self.max_num_neighbors = max_num_neighbors
    self.neighbor_cutoff = neighbor_cutoff
    self.strip_hydrogens = strip_hydrogens
    self.neighborlist_featurizer = NeighborListComplexAtomicCoordinates(
        self.max_num_neighbors, self.neighbor_cutoff)

  def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
    try:
      frag1_coords, frag1_mol = rdkit_util.load_molecule(mol_pdb_file)
      frag2_coords, frag2_mol = rdkit_util.load_molecule(protein_pdb_file)
    except MoleculeLoadException:
      # Currently handles loading failures by returning None
      # TODO: Is there a better handling procedure?
      logging.warning("Some molecules cannot be loaded by Rdkit. Skipping")
      return None
    system_mol = rdkit_util.merge_molecules(frag1_mol, frag2_mol)
    system_coords = rdkit_util.get_xyz_from_mol(system_mol)

    frag1_coords, frag1_mol = self._strip_hydrogens(frag1_coords, frag1_mol)
    frag2_coords, frag2_mol = self._strip_hydrogens(frag2_coords, frag2_mol)
    system_coords, system_mol = self._strip_hydrogens(system_coords, system_mol)

    try:
      frag1_coords, frag1_neighbor_list, frag1_z = self.featurize_mol(
          frag1_coords, frag1_mol, self.frag1_num_atoms)

      frag2_coords, frag2_neighbor_list, frag2_z = self.featurize_mol(
          frag2_coords, frag2_mol, self.frag2_num_atoms)

      system_coords, system_neighbor_list, system_z = self.featurize_mol(
          system_coords, system_mol, self.complex_num_atoms)
    except ValueError as e:
      logging.warning(
          "max_atoms was set too low. Some complexes too large and skipped")
      return None

    return frag1_coords, frag1_neighbor_list, frag1_z, frag2_coords, frag2_neighbor_list, frag2_z, \
           system_coords, system_neighbor_list, system_z

  def get_Z_matrix(self, mol, max_atoms):
    if len(mol.GetAtoms()) > max_atoms:
      raise ValueError("A molecule is larger than permitted by max_atoms. "
                       "Increase max_atoms and try again.")
    return pad_array(
        np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()]), max_atoms)

  def featurize_mol(self, coords, mol, max_num_atoms):
    logging.info("Featurizing molecule of size: %d", len(mol.GetAtoms()))
    neighbor_list = compute_neighbor_list(coords, self.neighbor_cutoff,
                                          self.max_num_neighbors, None)
    z = self.get_Z_matrix(mol, max_num_atoms)
    z = pad_array(z, max_num_atoms)
    coords = pad_array(coords, (max_num_atoms, 3))
    return coords, neighbor_list, z

  def _strip_hydrogens(self, coords, mol):

    class MoleculeShim(object):
      """
      Shim of a Molecule which supports #GetAtoms()
      """

      def __init__(self, atoms):
        self.atoms = [AtomShim(x) for x in atoms]

      def GetAtoms(self):
        return self.atoms

    class AtomShim(object):

      def __init__(self, atomic_num):
        self.atomic_num = atomic_num

      def GetAtomicNum(self):
        return self.atomic_num

    if not self.strip_hydrogens:
      return coords, mol
    indexes_to_keep = []
    atomic_numbers = []
    for index, atom in enumerate(mol.GetAtoms()):
      if atom.GetAtomicNum() != 1:
        indexes_to_keep.append(index)
        atomic_numbers.append(atom.GetAtomicNum())
    mol = MoleculeShim(atomic_numbers)
    coords = coords[indexes_to_keep]
    return coords, mol
+0 −53
Original line number Diff line number Diff line
"""
Featurizes proposed binding pockets.
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2017, Stanford University"
__license__ = "MIT"

import numpy as np
from deepchem.utils.save import log
from deepchem.feat import Featurizer


class BindingPocketFeaturizer(Featurizer):
  """
  Featurizes binding pockets with information about chemical environments.
  """

  residues = [
      "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
      "LEU", "LYS", "MET", "PHE", "PRO", "PYL", "SER", "SEC", "THR", "TRP",
      "TYR", "VAL", "ASX", "GLX"
  ]

  n_features = len(residues)

  def featurize(self,
                protein_file,
                pockets,
                pocket_atoms_map,
                pocket_coords,
                verbose=False):
    """
    Calculate atomic coodinates.
    """
    import mdtraj
    protein = mdtraj.load(protein_file)
    n_pockets = len(pockets)
    n_residues = len(BindingPocketFeaturizer.residues)
    res_map = dict(zip(BindingPocketFeaturizer.residues, range(n_residues)))
    all_features = np.zeros((n_pockets, n_residues))
    for pocket_num, (pocket, coords) in enumerate(zip(pockets, pocket_coords)):
      pocket_atoms = pocket_atoms_map[pocket]
      for ind, atom in enumerate(pocket_atoms):
        atom_name = str(protein.top.atom(atom))
        # atom_name is of format RESX-ATOMTYPE
        # where X is a 1 to 4 digit number
        residue = atom_name[:3]
        if residue not in res_map:
          log("Warning: Non-standard residue in PDB file", verbose)
          continue
        atomtype = atom_name.split("-")[1]
        all_features[pocket_num, res_map[residue]] += 1
    return all_features

deepchem/feat/coulomb_matrices.py

deleted100644 → 0
+0 −271

File deleted.

Preview size limit exceeded, changes collapsed.

deepchem/feat/fingerprints.py

deleted100644 → 0
+0 −108
Original line number Diff line number Diff line
"""
Topological fingerprints.
"""
__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "MIT"

from deepchem.feat import Featurizer


class CircularFingerprint(Featurizer):
  """
  Circular (Morgan) fingerprints.

  Parameters
  ----------
  radius : int, optional (default 2)
      Fingerprint radius.
  size : int, optional (default 2048)
      Length of generated bit vector.
  chiral : bool, optional (default False)
      Whether to consider chirality in fingerprint generation.
  bonds : bool, optional (default True)
      Whether to consider bond order in fingerprint generation.
  features : bool, optional (default False)
      Whether to use feature information instead of atom information; see
      RDKit docs for more info.
  sparse : bool, optional (default False)
      Whether to return a dict for each molecule containing the sparse
      fingerprint.
  smiles : bool, optional (default False)
      Whether to calculate SMILES strings for fragment IDs (only applicable
      when calculating sparse fingerprints).
  """
  name = 'circular'

  def __init__(self,
               radius=2,
               size=2048,
               chiral=False,
               bonds=True,
               features=False,
               sparse=False,
               smiles=False):
    self.radius = radius
    self.size = size
    self.chiral = chiral
    self.bonds = bonds
    self.features = features
    self.sparse = sparse
    self.smiles = smiles

  def _featurize(self, mol):
    """
    Calculate circular fingerprint.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule.
    """
    from rdkit import Chem
    from rdkit.Chem import rdMolDescriptors
    if self.sparse:
      info = {}
      fp = rdMolDescriptors.GetMorganFingerprint(
          mol,
          self.radius,
          useChirality=self.chiral,
          useBondTypes=self.bonds,
          useFeatures=self.features,
          bitInfo=info)
      fp = fp.GetNonzeroElements()  # convert to a dict

      # generate SMILES for fragments
      if self.smiles:
        fp_smiles = {}
        for fragment_id, count in fp.items():
          root, radius = info[fragment_id][0]
          env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, root)
          frag = Chem.PathToSubmol(mol, env)
          smiles = Chem.MolToSmiles(frag)
          fp_smiles[fragment_id] = {'smiles': smiles, 'count': count}
        fp = fp_smiles
    else:
      fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
          mol,
          self.radius,
          nBits=self.size,
          useChirality=self.chiral,
          useBondTypes=self.bonds,
          useFeatures=self.features)
    return fp

  def __hash__(self):
    return hash((self.radius, self.size, self.chiral, self.bonds, self.features,
                 self.sparse, self.smiles))

  def __eq__(self, other):
    if not isinstance(self, other.__class__):
      return False
    return self.radius == other.radius and \
           self.size == other.size and \
           self.chiral == other.chiral and \
           self.bonds == other.bonds and \
           self.features == other.features and \
           self.sparse == other.sparse and \
           self.smiles == other.smiles
Loading