Commit 944e980c authored by evanfeinberg's avatar evanfeinberg
Browse files

surgery

parent 242ff9b9
Loading
Loading
Loading
Loading
+256 −0
Original line number Original line Diff line number Diff line
"""
Feature calculations.
"""
import types
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdGeometry, rdMolTransforms
from deepchem.utils.ob_utils import Ionizer

__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "BSD 3-clause"


def get_featurizers():
    """Compile a dict mapping strings to featurizer classes."""

    # import all Featurizer subclasses so __subclasses__ will work
    # these have to be local imports to avoid circular imports
    from .basic import MolecularWeight, SimpleDescriptors
    from .coulomb_matrices import CoulombMatrix
    from .dragon import DragonDescriptors
    from .esp import ESP
    from .fingerprints import CircularFingerprint
    from .images import MolImage
    from .scaffolds import ScaffoldGenerator
    from .shape_grid import ShapeGrid

    featurizers = {}
    for klass in Featurizer.__subclasses__():
        assert klass.name is not None, (klass.__name__ +
                                        " 'name' attribute is None.")
        if isinstance(klass.name, list):
            for name in klass.name:
                assert name not in featurizers
                featurizers[name] = klass
        else:
            assert klass.name not in featurizers
            featurizers[klass.name] = klass
    return featurizers


def resolve_featurizer(name):
    """
    Resolve featurizer class from a string.

    Parameters
    ----------
    name : str
        Featurizer name.
    """
    return get_featurizers()[name]

class ComplexFeaturizer(object):
  """"
  Abstract class for calculating features for mol/protein complexes.

  Class Attributes
  ----------------
  name : str or list
      Name (or names) of this featurizer (used for scripting).
  """
  name = None

  def featurize_complexes(self, mol_pdbs, protein_pdbs, log_every_n=1000):
    """
    Calculate features for mol/protein complexes.

    Parameters
    ----------
    mol_pdbs: list
      List of PDBs for molecules. Each PDB should be a list of lines of the
      PDB file.
    protein_pdbs: list
      List of PDBs for proteins. Each PDB should be a list of lines of the
      PDB file.
    """

    features = []
    for i, (mol_pdb, protein_pdb) in enumerate(zip(mol_pdbs, protein_pdbs)):
      if i % log_every_n == 0:
        print("Featurizing %d / %d" % (i, len(mol_pdbs)))
      features.append(self._featurize_complex(mol_pdb, protein_pdb))
    features = np.asarray(features)
    return features

  def _featurize_complex(self, mol_pdb, complex_pdb):
    """
    Calculate features for single mol/protein complex.

    Parameters
    ----------
    mol_pdb: list
      Should be a list of lines of the PDB file.
    complex_pdb: list
      Should be a list of lines of the PDB file.
    """
    raise NotImplementedError('Featurizer is not defined.')

class Featurizer(object):
  """
  Abstract class for calculating a set of features for a molecule.

  Child classes implement the _featurize method for calculating features
  for a single molecule. The feature matrix returned by featurize has a
  shape that is inferred from the shape of the features returned by
  _featurize, and is either indexed by molecules or by molecules and
  conformers depending on the value of the conformers class attribute.

  Class Attributes
  ----------------
  conformers : bool, optional (default False)
      Whether features are calculated for conformers. If True, the first
      two axes of the feature matrix will index molecules and conformers,
      respectively. If False, only molecule-level features are calculated
      and the feature matrix will not have a separate conformer dimension.
      This is a class attribute because some featurizers take 3D
      conformation into account and others do not, and this is not
      typically an instance-level decision.
  name : str or list
      Name (or names) of this featurizer (used for scripting).
  topo_view : bool (default False)
      Whether the calculated features represent a topological view of the
      data.
  """
  conformers = False
  name = None
  topo_view = False

  def featurize(self, mols, parallel=False, client_kwargs=None,
                view_flags=None):
    """
    Calculate features for molecules.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
    parallel : bool, optional
        Whether to train subtrainers in parallel using
        IPython.parallel (default False).
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    """
    if self.conformers and isinstance(mols, types.GeneratorType):
      mols = list(mols)

    if parallel:
      from IPython.parallel import Client

      if client_kwargs is None:
          client_kwargs = {}
      if view_flags is None:
          view_flags = {}
      client = Client(**client_kwargs)
      client.direct_view().use_dill()  # use dill
      view = client.load_balanced_view()
      view.set_flags(**view_flags)
      call = view.map(self._featurize, mols, block=False)
      features = call.get()

      # get output from engines
      call.display_outputs()

    else:
      features = [self._featurize(mol) for mol in mols]

    if self.conformers:
      features = self.conformer_container(mols, features)
    else:
      features = np.asarray(features)
    return features

  def _featurize(self, mol):
    """
    Calculate features for a single molecule.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule.
    """
    raise NotImplementedError('Featurizer is not defined.')

  def __call__(self, mols, parallel=False, client_kwargs=None,
               view_flags=None):
    """
    Calculate features for molecules.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
    parallel : bool, optional
        Whether to train subtrainers in parallel using
        IPython.parallel (default False).
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    """
    return self.featurize(mols, parallel, client_kwargs, view_flags)

  def conformer_container(self, mols, features):
    """
    Put features into a container with an extra dimension for
    conformers. Conformer indices that are not used are masked.

    For example, if mols contains 3 molecules with 1, 2, 5 conformers,
    respectively, then the final container will have 3 entries on its
    first axis and 5 entries on its second axis. The remaining axes
    correspond to feature dimensions.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
    features : list
        Features calculated for molecule conformers. Each element
        corresponds to the features for a molecule and should be an
        ndarray with conformers on the first axis.
    """

    # get the maximum number of conformers
    max_confs = max([mol.GetNumConformers() for mol in mols])
    if not max_confs:
      max_confs = 1

    # construct the new container
    # - first axis = # mols
    # - second axis = max # conformers
    # - remaining axes = determined by feature shape
    features_shape = None
    for i in xrange(len(features)):
      for j in xrange(len(features[i])):
        if features[i][j] is not None:
          features_shape = features[i][0].shape
          break
      if features_shape is not None:
        break
    if features_shape is None:
      raise ValueError('Cannot find any features.')
    shape = (len(mols), max_confs) + features_shape
    x = np.ma.masked_all(shape)

    # fill in the container
    for i, (mol, mol_features) in enumerate(zip(mols, features)):
      n_confs = max(mol.GetNumConformers(), 1)
      try:
        x[i, :n_confs] = mol_features
      except ValueError:  # handle None conformer values
        for j in xrange(n_confs):
          x[i, j] = mol_features[j]
    return x
+62 −0
Original line number Original line Diff line number Diff line
"""
Basic molecular features.
"""

__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "BSD 3-clause"

from rdkit.Chem import Descriptors

from deepchem.featurizers import Featurizer


class MolecularWeight(Featurizer):
    """
    Molecular weight.
    """
    name = ['mw', 'molecular_weight']

    def _featurize(self, mol):
        """
        Calculate molecular weight.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        wt = Descriptors.ExactMolWt(mol)
        wt = [wt]
        return wt


class SimpleDescriptors(Featurizer):
    """
    RDKit descriptors.

    See http://rdkit.org/docs/GettingStartedInPython.html
    #list-of-available-descriptors.
    """
    name = 'descriptors'

    def __init__(self):
        self.descriptors = []
        self.functions = []
        for descriptor, function in Descriptors.descList:
            self.descriptors.append(descriptor)
            self.functions.append(function)

    def _featurize(self, mol):
        """
        Calculate RDKit descriptors.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        rval = []
        for function in self.functions:
            rval.append(function(mol))
        return rval
+156 −0
Original line number Original line Diff line number Diff line
"""
Generate coulomb matrices for molecules.

See Montavon et al., _New Journal of Physics_ __15__ (2013) 095003.
"""

__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "BSD 3-clause"

import numpy as np

from rdkit import Chem

from deepchem.featurizers import Featurizer
from vs_utils.utils import pad_array


class CoulombMatrix(Featurizer):
    """
    Calculate Coulomb matrices for molecules.

    Parameters
    ----------
    max_atoms : int
        Maximum number of atoms for any molecule in the dataset. Used to
        pad the Coulomb matrix.
    remove_hydrogens : bool, optional (default True)
        Whether to remove hydrogens before constructing Coulomb matrix.
    randomize : bool, optional (default True)
        Whether to randomize Coulomb matrices to remove dependence on atom
        index order.
    n_samples : int, optional (default 1)
        Number of random Coulomb matrices to generate if randomize is True.
    seed : int, optional
        Random seed.
    """
    conformers = True
    name = 'coulomb_matrix'

    def __init__(self, max_atoms, remove_hydrogens=True, randomize=True,
                 n_samples=1, seed=None):
        self.max_atoms = int(max_atoms)
        self.remove_hydrogens = remove_hydrogens
        self.randomize = randomize
        self.n_samples = n_samples
        if seed is not None:
            seed = int(seed)
        self.seed = seed

    def _featurize(self, mol):
        """
        Calculate Coulomb matrices for molecules. If extra randomized
        matrices are generated, they are treated as if they are features
        for additional conformers.

        Since Coulomb matrices are symmetric, only the (flattened) upper
        triangular portion is returned.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        features = self.coulomb_matrix(mol)
        features = [f[np.triu_indices_from(f)] for f in features]
        features = np.asarray(features)
        return features

    def coulomb_matrix(self, mol):
        """
        Generate Coulomb matrices for each conformer of the given molecule.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        if self.remove_hydrogens:
            mol = Chem.RemoveHs(mol)
        n_atoms = mol.GetNumAtoms()
        z = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
        rval = []
        for conf in mol.GetConformers():
            d = self.get_interatomic_distances(conf)
            m = np.zeros((n_atoms, n_atoms))
            for i in xrange(mol.GetNumAtoms()):
                for j in xrange(mol.GetNumAtoms()):
                    if i == j:
                        m[i, j] = 0.5 * z[i] ** 2.4
                    elif i < j:
                        m[i, j] = (z[i] * z[j]) / d[i, j]
                        m[j, i] = m[i, j]
                    else:
                        continue
            if self.randomize:
                for random_m in self.randomize_coulomb_matrix(m):
                    random_m = pad_array(random_m, self.max_atoms)
                    rval.append(random_m)
            else:
                m = pad_array(m, self.max_atoms)
                rval.append(m)
        rval = np.asarray(rval)
        return rval

    def randomize_coulomb_matrix(self, m):
        """
        Randomize a Coulomb matrix as decribed in Montavon et al., _New Journal
        of Physics_ __15__ (2013) 095003:

            1. Compute row norms for M in a vector row_norms.
            2. Sample a zero-mean unit-variance noise vector e with dimension
               equal to row_norms.
            3. Permute the rows and columns of M with the permutation that
               sorts row_norms + e.

        Parameters
        ----------
        m : ndarray
            Coulomb matrix.
        n_samples : int, optional (default 1)
            Number of random matrices to generate.
        seed : int, optional
            Random seed.
        """
        rval = []
        row_norms = np.asarray([np.linalg.norm(row) for row in m], dtype=float)
        rng = np.random.RandomState(self.seed)
        for i in xrange(self.n_samples):
            e = rng.normal(size=row_norms.size)
            p = np.argsort(row_norms + e)
            new = m[p][:, p]  # permute rows first, then columns
            rval.append(new)
        return rval

    @staticmethod
    def get_interatomic_distances(conf):
        """
        Get interatomic distances for atoms in a molecular conformer.

        Parameters
        ----------
        conf : RDKit Conformer
            Molecule conformer.
        """
        n_atoms = conf.GetNumAtoms()
        coords = [conf.GetAtomPosition(i) for i in xrange(n_atoms)]
        d = np.zeros((n_atoms, n_atoms), dtype=float)
        for i in xrange(n_atoms):
            for j in xrange(n_atoms):
                if i < j:
                    d[i, j] = coords[i].Distance(coords[j])
                    d[j, i] = d[i, j]
                else:
                    continue
        return d
+3 −3
Original line number Original line Diff line number Diff line
@@ -10,13 +10,13 @@ import pandas as pd
import numpy as np
import numpy as np
import csv
import csv
from rdkit import Chem
from rdkit import Chem
from vs_utils.features.fingerprints import CircularFingerprint
from deepchem.featurizers.fingerprints import CircularFingerprint
from vs_utils.features.basic import SimpleDescriptors
from deepchem.featurizers.basic import SimpleDescriptors
from deepchem.utils.save import save_to_disk
from deepchem.utils.save import save_to_disk
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import load_from_disk
from deepchem.utils.save import load_pandas_from_disk
from deepchem.utils.save import load_pandas_from_disk
from vs_utils.utils import ScaffoldGenerator
from vs_utils.utils import ScaffoldGenerator
from vs_utils.features.nnscore import NNScoreComplexFeaturizer
from deepchem.featurizers.nnscore import NNScoreComplexFeaturizer
import multiprocessing as mp
import multiprocessing as mp
from functools import partial
from functools import partial


+82 −0
Original line number Original line Diff line number Diff line
"""
Topological fingerprints.
"""

__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "BSD 3-clause"

from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

from deepchem.featurizers import Featurizer


class CircularFingerprint(Featurizer):
    """
    Circular (Morgan) fingerprints.

    Parameters
    ----------
    radius : int, optional (default 2)
        Fingerprint radius.
    size : int, optional (default 2048)
        Length of generated bit vector.
    chiral : bool, optional (default False)
        Whether to consider chirality in fingerprint generation.
    bonds : bool, optional (default True)
        Whether to consider bond order in fingerprint generation.
    features : bool, optional (default False)
        Whether to use feature information instead of atom information; see
        RDKit docs for more info.
    sparse : bool, optional (default False)
        Whether to return a dict for each molecule containing the sparse
        fingerprint.
    smiles : bool, optional (default False)
        Whether to calculate SMILES strings for fragment IDs (only applicable
        when calculating sparse fingerprints).
    """
    name = 'circular'

    def __init__(self, radius=2, size=2048, chiral=False, bonds=True,
                 features=False, sparse=False, smiles=False):
        self.radius = radius
        self.size = size
        self.chiral = chiral
        self.bonds = bonds
        self.features = features
        self.sparse = sparse
        self.smiles = smiles

    def _featurize(self, mol):
        """
        Calculate circular fingerprint.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        if self.sparse:
            info = {}
            fp = rdMolDescriptors.GetMorganFingerprint(
                mol, self.radius, useChirality=self.chiral,
                useBondTypes=self.bonds, useFeatures=self.features,
                bitInfo=info)
            fp = fp.GetNonzeroElements()  # convert to a dict

            # generate SMILES for fragments
            if self.smiles:
                fp_smiles = {}
                for fragment_id, count in fp.items():
                    root, radius = info[fragment_id][0]
                    env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, root)
                    frag = Chem.PathToSubmol(mol, env)
                    smiles = Chem.MolToSmiles(frag)
                    fp_smiles[fragment_id] = {'smiles': smiles, 'count': count}
                fp = fp_smiles
        else:
            fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
                mol, self.radius, nBits=self.size, useChirality=self.chiral,
                useBondTypes=self.bonds, useFeatures=self.features)
        return fp
Loading