Unverified Commit 344b1ff7 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1992 from deepchem/featurizers

Introduce MolecularFeaturizer
parents 99831b73 d7fa74b8
Loading
Loading
Loading
Loading
+1 −4
Original line number Diff line number Diff line
"""
Making it easy to import in classes.
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

from deepchem.feat.base_classes import Featurizer
from deepchem.feat.base_classes import MolecularFeaturizer
from deepchem.feat.base_classes import ComplexFeaturizer
from deepchem.feat.base_classes import UserDefinedFeaturizer
from deepchem.feat.graph_features import ConvMolFeaturizer
+0 −4
Original line number Diff line number Diff line
"""
Atomic coordinate featurizer.
"""
__author__ = "Joseph Gomes and Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import logging
import numpy as np
from deepchem.utils.save import log
+104 −21
Original line number Diff line number Diff line
@@ -6,9 +6,7 @@ import types
import numpy as np
import multiprocessing

__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "BSD 3-clause"
logger = logging.getLogger(__name__)


def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
@@ -16,6 +14,58 @@ def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
  return featurizer._featurize_complex(mol_pdb_file, protein_pdb_file)


class Featurizer(object):
  """Abstract class for calculating a set of features for a datapoint.

  This class is abstract and is not meant to be used directly. You'll
  likely only interact with this class if you're a developer. In
  that case, you might want to make a child class which implements
  the `_featurize` method for calculating features for a single
  datapoint if you'd like to make a featurizer for a new datatype.
  """

  def featurize(self, datapoints, log_every_n=1000):
    """Calculate features for datapoints.

    Parameters
    ----------
    datapoints: iterable
      A sequence of objects that you'd like to featurize. Subclasses of
      `Featurizer` should implement the `_featurize` method that
      featurizes a single object in the sequence.
    log_every_n: int, optional (default 1000)
      A progress message is logged every `log_every_n` datapoints.

    Returns
    -------
    A numpy array containing a featurized representation of `datapoints`.
    A datapoint whose featurization raises an exception contributes an
    empty array. If the per-datapoint features cannot be stacked into a
    regular array (for instance because of such an empty placeholder), a
    1D object array holding one entry per datapoint is returned instead.
    """
    datapoints = list(datapoints)
    features = []
    for i, point in enumerate(datapoints):
      if i % log_every_n == 0:
        logger.info("Featurizing datapoint %i", i)
      try:
        features.append(self._featurize(point))
      except Exception:
        # Best-effort: one bad datapoint must not abort the whole batch.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        logger.warning(
            "Failed to featurize datapoint %d. Appending empty array", i)
        features.append(np.array([]))

    try:
      return np.asarray(features)
    except ValueError:
      # Recent NumPy releases refuse to build an array from rows of
      # differing shapes, which is what the empty placeholders above
      # produce. Fall back to an explicit object array so that the
      # successfully computed features are still returned.
      ragged = np.empty(len(features), dtype=object)
      for j, feature in enumerate(features):
        ragged[j] = feature
      return ragged

  def __call__(self, datapoints):
    """Calculate features for datapoints.

    Convenience wrapper equivalent to `self.featurize(datapoints)`.

    Parameters
    ----------
    datapoints: iterable
      A sequence of objects that you'd like to featurize.

    Returns
    -------
    A numpy array containing a featurized representation of `datapoints`.
    """
    return self.featurize(datapoints)


class ComplexFeaturizer(object):
  """"
  Abstract class for calculating features for mol/protein complexes.
@@ -73,29 +123,62 @@ class ComplexFeaturizer(object):
    raise NotImplementedError('Featurizer is not defined.')


class Featurizer(object):
  """
  Abstract class for calculating a set of features for a molecule.
class MolecularFeaturizer(Featurizer):
  """Abstract class for calculating a set of features for a
  molecule.

  Child classes implement the _featurize method for calculating features
  for a single molecule.
  """
  The defining feature of a `MolecularFeaturizer` is that it
  uses SMILES strings and RDKIT molecule objects to represent
  small molecules. All other featurizers which are subclasses of
  this class should plan to process input which comes as smiles
  strings or RDKIT molecules. 

  def featurize(self, mols, verbose=True, log_every_n=1000):
  Child classes need to implement the _featurize method for
  calculating features for a single molecule.

  Note
  ----
  In general, subclasses of this class will require RDKit to be installed.
  """
    Calculate features for molecules.

  def featurize(self, molecules, log_every_n=1000):
    """Calculate features for molecules.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
    molecules: RDKit Mol / SMILES string /iterable
        RDKit Mol, or SMILES string or iterable sequence of RDKit mols/SMILES
        strings.

    Returns
    -------
    A numpy array containing a featurized representation of
    `molecules`.
    """
    mols = list(mols)
    try:
      from rdkit import Chem
      from rdkit.Chem.rdchem import Mol
    except ModuleNotFoundError:
      raise ValueError("This class requires RDKit to be installed.")
    # Special case handling of single molecule
    if isinstance(molecules, str) or isinstance(molecules, Mol):
      molecules = [molecules]
    else:
      # Convert iterables to list
      molecutes = list(molecules)
    features = []
    for i, mol in enumerate(mols):
      if mol is not None:
    for i, mol in enumerate(molecules):
      if i % log_every_n == 0:
        logger.info("Featurizing datapoint %i" % i)
      try:
        # Process only case of SMILES strings.
        if isinstance(mol, str):
          # mol must be a SMILES string so parse
          mol = Chem.MolFromSmiles(mol)
        features.append(self._featurize(mol))
      else:
      except:
        logger.warning(
            "Failed to featurize datapoint %d. Appending empty array")
        features.append(np.array([]))

    features = np.asarray(features)
@@ -112,16 +195,16 @@ class Featurizer(object):
    """
    raise NotImplementedError('Featurizer is not defined.')

  def __call__(self, mols):
  def __call__(self, molecules):
    """
    Calculate features for molecules.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
    molecules: iterable
        An iterable yielding RDKit Mol objects or SMILES strings.
    """
    return self.featurize(mols)
    return self.featurize(molecules)


class UserDefinedFeaturizer(Featurizer):
+40 −16
Original line number Diff line number Diff line
"""
Basic molecular features.
"""
__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "MIT"

from deepchem.feat import Featurizer
import numpy as np
from deepchem.feat.base_classes import MolecularFeaturizer


class MolecularWeight(Featurizer):
  """
  Molecular weight.
class MolecularWeight(MolecularFeaturizer):
  """Molecular weight.

  Note
  ----
  This class requires RDKit to be installed.
  """
  name = ['mw', 'molecular_weight']

  def _featurize(self, mol):
    """
@@ -22,21 +22,37 @@ class MolecularWeight(Featurizer):
    ----------
    mol : RDKit Mol
        Molecule.

    Returns
    -------
    np.ndarray of length 1 containing the molecular weight.
    """
    try:
      from rdkit.Chem import Descriptors
    except ModuleNotFoundError:
      raise ValueError("This class requires RDKit to be installed.")
    wt = Descriptors.ExactMolWt(mol)
    wt = [wt]
    return wt
    return np.asarray(wt)


class RDKitDescriptors(Featurizer):
  """
  RDKit descriptors.
class RDKitDescriptors(MolecularFeaturizer):
  """RDKit descriptors.

  This class computes a list of chemical descriptors using RDKit.

  See http://rdkit.org/docs/GettingStartedInPython.html
  #list-of-available-descriptors.

  Attributes
  ----------
  descriptors: np.ndarray
    1D array of RDKit descriptor names used in this class.

  Note
  ----
  This class requires RDKit to be installed.
  """
  name = 'descriptors'

  # (ytz): This is done to avoid future compatibility issues like inclusion of
  # the 3D descriptors or changing the feature size.
@@ -69,9 +85,12 @@ class RDKitDescriptors(Featurizer):
  ])

  def __init__(self):
    try:
      from rdkit.Chem import Descriptors
    except ModuleNotFoundError:
      raise ValueError("This class requires RDKit to be installed.")
    self.descriptors = []
    self.descList = []
    from rdkit.Chem import Descriptors
    for descriptor, function in Descriptors.descList:
      if descriptor in self.allowedDescriptors:
        self.descriptors.append(descriptor)
@@ -85,8 +104,13 @@ class RDKitDescriptors(Featurizer):
    ----------
    mol : RDKit Mol
        Molecule.

    Returns
    -------
    rval: np.ndarray
      1D array of RDKit descriptors for `mol`
    """
    rval = []
    for desc_name, function in self.descList:
      rval.append(function(mol))
    return rval
    return np.asarray(rval)
+95 −21
Original line number Diff line number Diff line
@@ -3,24 +3,38 @@ Generate coulomb matrices for molecules.

See Montavon et al., _New Journal of Physics_ __15__ (2013) 095003.
"""
__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "MIT"

import numpy as np
import deepchem as dc
from deepchem.feat import Featurizer
from deepchem.feat.base_classes import MolecularFeaturizer
from deepchem.utils import pad_array
from deepchem.feat.atomic_coordinates import AtomicCoordinates


class BPSymmetryFunctionInput(Featurizer):
  """
  Calculate Symmetry Function for each atom in the molecules
  Methods described in https://journals.aps.org/prl/pdf/10.1103/PhysRevLett.98.146401
class BPSymmetryFunctionInput(MolecularFeaturizer):
  """Calculate Symmetry Function for each atom in the molecules

  This method is described in [1]_ 

  References
  ----------
  .. [1] Behler, Jörg, and Michele Parrinello. "Generalized neural-network
         representation of high-dimensional potential-energy surfaces." Physical
         review letters 98.14 (2007): 146401.

  Note
  ----
  This class requires RDKit to be installed.
  """

  def __init__(self, max_atoms):
    """Initialize this featurizer.

    Parameters
    ----------
    max_atoms: int
      The maximum number of atoms expected for molecules this featurizer will
      process.
    """
    self.max_atoms = max_atoms

  def _featurize(self, mol):
@@ -34,9 +48,11 @@ class BPSymmetryFunctionInput(Featurizer):
    return np.pad(features, ((0, self.max_atoms - n_atoms), (0, 0)), 'constant')


class CoulombMatrix(Featurizer):
  """
  Calculate Coulomb matrices for molecules.
class CoulombMatrix(MolecularFeaturizer):
  """Calculate Coulomb matrices for molecules.

  Coulomb matrices provide a representation of the electronic structure of a
  molecule. This method is described in [1]_.

  Parameters
  ----------
@@ -55,14 +71,24 @@ class CoulombMatrix(Featurizer):
  seed : int, optional
      Random seed.

  Example:

  Example
  -------
  >>> featurizers = dc.feat.CoulombMatrix(max_atoms=23)
  >>> input_file = 'deepchem/feat/tests/data/water.sdf' # really backed by water.sdf.csv
  >>> tasks = ["atomization_energy"]
  >>> loader = dc.data.SDFLoader(tasks, featurizer=featurizers)
  >>> dataset = loader.create_dataset(input_file) #doctest: +ELLIPSIS
  Reading structures from deepchem/feat/tests/data/water.sdf.

  References
  ----------
  .. [1] Montavon, Grégoire, et al. "Learning invariant representations of
         molecules for atomization energy prediction." Advances in neural information
         processing systems. 2012.

  Note
  ----
  This class requires RDKit to be installed.
  """
  conformers = True
  name = 'coulomb_matrix'
@@ -74,6 +100,28 @@ class CoulombMatrix(Featurizer):
               upper_tri=False,
               n_samples=1,
               seed=None):
    """Initialize this featurizer.

    Parameters
    ----------
    max_atoms: int
      The maximum number of atoms expected for molecules this featurizer will
      process.
    remove_hydrogens: bool, optional (default False)
      If True, remove hydrogens before processing them.
    randomize: bool, optional (default False)
      If True, use method `randomize_coulomb_matrix` to randomize Coulomb matrices.
    upper_tri: bool, optional (default False)
      Generate only upper triangle part of Coulomb matrices.
    n_samples: int, optional (default 1)
      If `randomize` is set to True, the number of random samples to draw.
    seed: int, optional (default None)
      Random seed to use.
    """
    try:
      from rdkit import Chem
    except ModuleNotFoundError:
      raise ValueError("This class requires RDKit to be installed.")
    self.max_atoms = int(max_atoms)
    self.remove_hydrogens = remove_hydrogens
    self.randomize = randomize
@@ -141,9 +189,7 @@ class CoulombMatrix(Featurizer):
    return rval

  def randomize_coulomb_matrix(self, m):
    """
    Randomize a Coulomb matrix as decribed in Montavon et al.,
    New Journal of Physics, 15, (2013), 095003:
    """Randomize a Coulomb matrix as decribed in [1]_:

    1. Compute row norms for M in a vector row_norms.
    2. Sample a zero-mean unit-variance noise vector e with dimension
@@ -159,6 +205,10 @@ class CoulombMatrix(Featurizer):
        Number of random matrices to generate.
    seed : int, optional
        Random seed.

    References
    ----------
    .. [1] Montavon et al., New Journal of Physics, 15, (2013), 095003
    """
    rval = []
    row_norms = np.asarray([np.linalg.norm(row) for row in m], dtype=float)
@@ -196,8 +246,10 @@ class CoulombMatrix(Featurizer):


class CoulombMatrixEig(CoulombMatrix):
  """
  Calculate the eigenvales of Coulomb matrices for molecules.
  """Calculate the eigenvalues of Coulomb matrices for molecules.

  This featurizer computes the eigenvalues of the Coulomb matrices for provided
  molecules. Coulomb matrices are described in [1]_.

  Parameters
  ----------
@@ -214,14 +266,20 @@ class CoulombMatrixEig(CoulombMatrix):
  seed : int, optional
      Random seed.

  Example:

  Example
  -------
  >>> featurizers = dc.feat.CoulombMatrixEig(max_atoms=23)
  >>> input_file = 'deepchem/feat/tests/data/water.sdf' # really backed by water.sdf.csv
  >>> tasks = ["atomization_energy"]
  >>> loader = dc.data.SDFLoader(tasks, featurizer=featurizers)
  >>> dataset = loader.create_dataset(input_file) #doctest: +ELLIPSIS
  Reading structures from deepchem/feat/tests/data/water.sdf.

  References
  ----------
  .. [1] Montavon, Grégoire, et al. "Learning invariant representations of
         molecules for atomization energy prediction." Advances in neural information
         processing systems. 2012.
  """

  conformers = True
@@ -233,6 +291,22 @@ class CoulombMatrixEig(CoulombMatrix):
               randomize=False,
               n_samples=1,
               seed=None):
    """Initialize this featurizer.

    Parameters
    ----------
    max_atoms: int
      The maximum number of atoms expected for molecules this featurizer will
      process.
    remove_hydrogens: bool, optional (default False)
      If True, remove hydrogens before processing them.
    randomize: bool, optional (default False)
      If True, use method `randomize_coulomb_matrix` to randomize Coulomb matrices.
    n_samples: int, optional (default 1)
      If `randomize` is set to True, the number of random samples to draw.
    seed: int, optional (default None)
      Random seed to use.
    """
    self.max_atoms = int(max_atoms)
    self.remove_hydrogens = remove_hydrogens
    self.randomize = randomize
Loading