Commit 067cc2d8 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

changes

parent 99831b73
Loading
Loading
Loading
Loading
+1 −4
Original line number Diff line number Diff line
"""
Making it easy to import in classes.
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

from deepchem.feat.base_classes import Featurizer
from deepchem.feat.base_classes import MolecularFeaturizer
from deepchem.feat.base_classes import ComplexFeaturizer
from deepchem.feat.base_classes import UserDefinedFeaturizer
from deepchem.feat.graph_features import ConvMolFeaturizer
+0 −4
Original line number Diff line number Diff line
"""
Atomic coordinate featurizer.
"""
__author__ = "Joseph Gomes and Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import logging
import numpy as np
from deepchem.utils.save import log
+86 −12
Original line number Diff line number Diff line
@@ -6,9 +6,7 @@ import types
import numpy as np
import multiprocessing

__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "BSD 3-clause"
logger = logging.getLogger(__name__)


def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
@@ -16,6 +14,53 @@ def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
  return featurizer._featurize_complex(mol_pdb_file, protein_pdb_file)


class Featurizer(object):
  """Abstract class for calculating a set of features for a datapoint.

  This class is abstract and cannot be invoked directly. You'll
  likely only interact with this class if you're a developer. In
  that case, you might want to make a child class which
  implements the `_featurize` method for calculating features for
  a single datapoint if you'd like to make a featurizer for a
  new datatype.
  """

  def featurize(self, datapoints, log_every_n=1000):
    """Calculate features for datapoints.

    Parameters
    ----------
    datapoints: iterable
       Datapoints to featurize. Every entry that is not `None` is
       passed to `_featurize`; a `None` entry contributes an empty
       array instead.
    log_every_n: int, default 1000
       A progress message is logged every `log_every_n` datapoints.
       A falsy value (`0` or `None`) turns progress logging off.

    Returns
    -------
    A numpy array containing a featurized representation of
    `datapoints`. If the per-datapoint features do not share a
    common shape, an object array holding one entry per datapoint
    is returned.
    """
    # Resolved by module name, so this is the module's own logger.
    progress_log = logging.getLogger(__name__)
    # Materialize first so generators are consumed before any work starts.
    datapoints = list(datapoints)
    features = []
    for i, point in enumerate(datapoints):
      # The `log_every_n and` guard avoids a modulo-by-zero when disabled.
      if log_every_n and i % log_every_n == 0:
        progress_log.info("Featurizing datapoint %i", i)
      if point is not None:
        features.append(self._featurize(point))
      else:
        features.append(np.array([]))

    try:
      return np.asarray(features)
    except ValueError:
      # Recent NumPy refuses to build an array from ragged input (e.g.
      # an empty placeholder next to real feature vectors) unless
      # dtype=object is requested explicitly.
      return np.asarray(features, dtype=object)

  def _featurize(self, datapoint):
    """Calculate features for a single datapoint.

    Child classes must override this method.

    Parameters
    ----------
    datapoint: object
       A single datapoint taken from the input of `featurize`.

    Raises
    ------
    NotImplementedError
       Always, in this base class.
    """
    raise NotImplementedError('Featurizer is not defined.')

  def __call__(self, datapoints):
    """Calculate features for datapoints.

    Equivalent to calling `featurize(datapoints)` with the default
    logging interval.

    Parameters
    ----------
    datapoints: iterable
       Datapoints to featurize; see `featurize`.

    Returns
    -------
    The numpy array produced by `featurize`.
    """
    return self.featurize(datapoints)


class ComplexFeaturizer(object):
  """"
  Abstract class for calculating features for mol/protein complexes.
@@ -73,27 +118,56 @@ class ComplexFeaturizer(object):
    raise NotImplementedError('Featurizer is not defined.')


class Featurizer(object):
  """
  Abstract class for calculating a set of features for a molecule.
class MolecularFeaturizer(object):
  """Abstract class for calculating a set of features for a
  molecule.

  The defining feature of a `MolecularFeaturizer` is that it
  uses SMILES strings and RDKIT molecule objects to represent
  small molecules. All other featurizers which are subclasses of
  this class should plan to process input which comes as smiles
  strings or RDKIT molecules. 

  Child classes implement the _featurize method for calculating features
  for a single molecule.
  Child classes need to implement the _featurize method for
  calculating features for a single molecule.

  Note
  ----
  In general, subclasses of this class will require RDKit to be installed.
  """

  def featurize(self, mols, verbose=True, log_every_n=1000):
    """
    Calculate features for molecules.
    """Calculate features for molecules.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
        RDKit Mol, or SMILES string, or filename for
        mol2/sdf/pdb/pdbqt file.

    Returns
    -------
    A numpy array containing a featurized representation of
    `datapoints`.
    """
    try:
      from rdkit import Chem
      from rdkit.Chem.rdchem import Mol
    except ModuleNotFoundError:
      raise ValueError("This class requires RDKit to be installed.")
    # Special case handling of single molecule
    if isinstance(mols, str) or isinstance(mols, Mol):
      mols = [mols]
    else:
      # Convert iterables to list
      mols = list(mols)
    features = []
    for i, mol in enumerate(mols):
      if mol is not None:
        # Process only case of SMILES strings.
        if isinstance(mol, str):
          # mol must be a SMILES string so parse
          mol = Chem.MolFromSmiles(mol)
        features.append(self._featurize(mol))
      else:
        features.append(np.array([]))
+28 −12
Original line number Diff line number Diff line
"""
Basic molecular features.
"""
__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "MIT"

from deepchem.feat import Featurizer
from deepchem.feat.base_classes import MolecularFeaturizer


class MolecularWeight(Featurizer):
  """
  Molecular weight.
class MolecularWeight(MolecularFeaturizer):
  """Molecular weight.

  Note
  ----
  This class requires RDKit to be installed.
  """
  name = ['mw', 'molecular_weight']

@@ -23,18 +23,26 @@ class MolecularWeight(Featurizer):
    mol : RDKit Mol
        Molecule.
    """
    try:
      from rdkit.Chem import Descriptors
    except ModuleNotFoundError:
      raise ValueError("This class requires RDKit to be installed.")
    wt = Descriptors.ExactMolWt(mol)
    wt = [wt]
    return wt


class RDKitDescriptors(Featurizer):
  """
  RDKit descriptors.
class RDKitDescriptors(MolecularFeaturizer):
  """RDKit descriptors.

  This class computes a list of chemical descriptors using RDKit.

  See http://rdkit.org/docs/GettingStartedInPython.html
  #list-of-available-descriptors.

  Note
  ----
  This class requires RDKit to be installed.
  """
  name = 'descriptors'

@@ -69,9 +77,12 @@ class RDKitDescriptors(Featurizer):
  ])

  def __init__(self):
    try:
      from rdkit.Chem import Descriptors
    except ModuleNotFoundError:
      raise ValueError("This class requires RDKit to be installed.")
    self.descriptors = []
    self.descList = []
    from rdkit.Chem import Descriptors
    for descriptor, function in Descriptors.descList:
      if descriptor in self.allowedDescriptors:
        self.descriptors.append(descriptor)
@@ -85,6 +96,11 @@ class RDKitDescriptors(Featurizer):
    ----------
    mol : RDKit Mol
        Molecule.

    Returns
    -------
    rval: np.ndarray
      Vector of RDKit descriptors for `mol`
    """
    rval = []
    for desc_name, function in self.descList:
+48 −18
Original line number Diff line number Diff line
@@ -3,21 +3,27 @@ Generate coulomb matrices for molecules.

See Montavon et al., _New Journal of Physics_ __15__ (2013) 095003.
"""
__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "MIT"

import numpy as np
import deepchem as dc
from deepchem.feat import Featurizer
from deepchem.feat.base_classes import MolecularFeaturizer
from deepchem.utils import pad_array
from deepchem.feat.atomic_coordinates import AtomicCoordinates


class BPSymmetryFunctionInput(Featurizer):
  """
  Calculate Symmetry Function for each atom in the molecules
  Methods described in https://journals.aps.org/prl/pdf/10.1103/PhysRevLett.98.146401
class BPSymmetryFunctionInput(MolecularFeaturizer):
  """Calculate Symmetry Function for each atom in the molecules

  This method is described in [1]_ 

  References
  ----------
  .. [1] Behler, Jörg, and Michele Parrinello. "Generalized neural-network
         representation of high-dimensional potential-energy surfaces." Physical
         review letters 98.14 (2007): 146401.

  Note
  ----
  This class requires RDKit to be installed.
  """

  def __init__(self, max_atoms):
@@ -34,9 +40,11 @@ class BPSymmetryFunctionInput(Featurizer):
    return np.pad(features, ((0, self.max_atoms - n_atoms), (0, 0)), 'constant')


class CoulombMatrix(Featurizer):
  """
  Calculate Coulomb matrices for molecules.
class CoulombMatrix(MolecularFeaturizer):
  """Calculate Coulomb matrices for molecules.

  Coulomb matrices provide a representation of the electronic structure of a
  molecule. This method is described in [1]_.

  Parameters
  ----------
@@ -55,14 +63,24 @@ class CoulombMatrix(Featurizer):
  seed : int, optional
      Random seed.

  Example:

  Example
  -------
  >>> featurizers = dc.feat.CoulombMatrix(max_atoms=23)
  >>> input_file = 'deepchem/feat/tests/data/water.sdf' # really backed by water.sdf.csv
  >>> tasks = ["atomization_energy"]
  >>> loader = dc.data.SDFLoader(tasks, featurizer=featurizers)
  >>> dataset = loader.create_dataset(input_file) #doctest: +ELLIPSIS
  Reading structures from deepchem/feat/tests/data/water.sdf.

  References
  ----------
  .. [1] Montavon, Grégoire, et al. "Learning invariant representations of
         molecules for atomization energy prediction." Advances in neural information
         processing systems. 2012.

  Note
  ----
  This class requires RDKit to be installed.
  """
  conformers = True
  name = 'coulomb_matrix'
@@ -74,6 +92,10 @@ class CoulombMatrix(Featurizer):
               upper_tri=False,
               n_samples=1,
               seed=None):
    try:
      from rdkit import Chem
    except ModuleNotFoundError:
      raise ValueError("This class requires RDKit to be installed.")
    self.max_atoms = int(max_atoms)
    self.remove_hydrogens = remove_hydrogens
    self.randomize = randomize
@@ -196,8 +218,10 @@ class CoulombMatrix(Featurizer):


class CoulombMatrixEig(CoulombMatrix):
  """
  Calculate the eigenvales of Coulomb matrices for molecules.
  """Calculate the eigenvalues of Coulomb matrices for molecules.

  This featurizer computes the eigenvalues of the Coulomb matrices for provided
  molecules. Coulomb matrices are described in [1]_.

  Parameters
  ----------
@@ -214,14 +238,20 @@ class CoulombMatrixEig(CoulombMatrix):
  seed : int, optional
      Random seed.

  Example:

  Example
  -------
  >>> featurizers = dc.feat.CoulombMatrixEig(max_atoms=23)
  >>> input_file = 'deepchem/feat/tests/data/water.sdf' # really backed by water.sdf.csv
  >>> tasks = ["atomization_energy"]
  >>> loader = dc.data.SDFLoader(tasks, featurizer=featurizers)
  >>> dataset = loader.create_dataset(input_file) #doctest: +ELLIPSIS
  Reading structures from deepchem/feat/tests/data/water.sdf.

  References
  ----------
  .. [1] Montavon, Grégoire, et al. "Learning invariant representations of
         molecules for atomization energy prediction." Advances in neural information
         processing systems. 2012.
  """

  conformers = True
Loading