Unverified Commit d941e08f authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #2008 from ncfrey/crystal_featurizer

Crystal featurizer base classes
parents a5091674 aa573bf0
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -3,6 +3,8 @@ Making it easy to import in classes.
"""
from deepchem.feat.base_classes import Featurizer
from deepchem.feat.base_classes import MolecularFeaturizer
from deepchem.feat.base_classes import StructureFeaturizer
from deepchem.feat.base_classes import CompositionFeaturizer
from deepchem.feat.base_classes import ComplexFeaturizer
from deepchem.feat.base_classes import UserDefinedFeaturizer
from deepchem.feat.graph_features import ConvMolFeaturizer
+196 −1
Original line number Diff line number Diff line
@@ -5,9 +5,12 @@ import logging
import types
import numpy as np
import multiprocessing
from typing import Iterable, Union, Dict, Any

logger = logging.getLogger(__name__)

JSON = Dict[str, Any]


def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
  logging.info(log_message)
@@ -165,7 +168,7 @@ class MolecularFeaturizer(Featurizer):
      molecules = [molecules]
    else:
      # Convert iterables to list
      molecutes = list(molecules)
      molecules = list(molecules)
    features = []
    for i, mol in enumerate(molecules):
      if i % log_every_n == 0:
@@ -207,6 +210,198 @@ class MolecularFeaturizer(Featurizer):
    return self.featurize(molecules)


class StructureFeaturizer(Featurizer):
  """
  Abstract class for calculating a set of features for an
  inorganic crystal structure.

  The defining feature of a `StructureFeaturizer` is that it
  operates on 3D crystal structures with periodic boundary conditions.
  Inorganic crystal structures are represented by Pymatgen structure
  objects. Featurizers for inorganic crystal structures that are subclasses of
  this class should plan to process input which comes as pymatgen
  structure objects.

  This class is abstract and cannot be invoked directly. You'll
  likely only interact with this class if you're a developer. Child
  classes need to implement the _featurize method for calculating
  features for a single crystal structure.

  Notes
  -----
  Some subclasses of this class will require pymatgen and matminer to be
  installed.

  """

  def featurize(self, structures: Iterable[JSON],
                log_every_n: int = 1000) -> np.ndarray:
    """Calculate features for crystal structures.

    Each input dictionary is converted to a pymatgen `Structure` with
    `Structure.from_dict` and then handed to `_featurize`.

    Parameters
    ----------
    structures: Iterable[JSON]
      Iterable sequence of pymatgen structure dictionaries.
      Json-serializable dictionary representation of pymatgen.core.structure
      https://pymatgen.org/pymatgen.core.structure.html
      A single structure dictionary is also accepted and is treated as a
      sequence of length one.
    log_every_n: int, default 1000
      Logging messages reported every `log_every_n` samples.

    Returns
    -------
    features: np.ndarray
      A numpy array containing a featurized representation of
      `structures`. Datapoints that could not be featurized contribute
      an empty array.

    Raises
    ------
    ValueError
      If pymatgen cannot be imported.

    """

    # A single structure dictionary is itself iterable (over its keys), so it
    # has to be recognised explicitly; otherwise `list()` would turn it into
    # a list of key strings instead of a one-element list of structures.
    if isinstance(structures, dict) or not isinstance(structures, Iterable):
      structures = [structures]
    else:
      # Convert iterables to list
      structures = list(structures)

    try:
      # Import from the defining module rather than the package root, since
      # the root-level re-export is not available in every pymatgen release.
      from pymatgen.core.structure import Structure
    except ImportError as err:
      raise ValueError(
          "This class requires pymatgen to be installed.") from err

    features = []
    for idx, structure in enumerate(structures):
      if idx % log_every_n == 0:
        logger.info("Featurizing datapoint %i", idx)
      try:
        s = Structure.from_dict(structure)
        features.append(self._featurize(s))
      except Exception:
        # Best-effort by design: one bad datapoint must not abort the whole
        # run. `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        logger.warning(
            "Failed to featurize datapoint %i. Appending empty array", idx)
        logger.debug(
            "Featurization error for datapoint %i", idx, exc_info=True)
        features.append(np.array([]))

    features = np.asarray(features)
    return features

  def _featurize(self, structure: "pymatgen.Structure"):
    """Calculate features for a single crystal structure.

    Subclasses must override this method; the base implementation always
    raises.

    Parameters
    ----------
    structure: pymatgen.Structure object
      Structure object with 3D coordinates and periodic lattice.

    Raises
    ------
    NotImplementedError
      Always, in this abstract base class.

    """

    raise NotImplementedError('Featurizer is not defined.')

  def __call__(self, structures: Iterable[JSON]):
    """Calculate features for crystal structures.

    Equivalent to calling `featurize(structures)` with the default
    `log_every_n`.

    Parameters
    ----------
    structures: Iterable[JSON]
      An iterable of crystal structure dictionaries.

    """

    return self.featurize(structures)


class CompositionFeaturizer(Featurizer):
  """
  Abstract class for calculating a set of features for an
  inorganic crystal composition.

  The defining feature of a `CompositionFeaturizer` is that it
  operates on 3D crystal chemical compositions.
  Inorganic crystal compositions are represented by Pymatgen composition
  objects. Featurizers for inorganic crystal compositions that are
  subclasses of this class should plan to process input which comes as
  Pymatgen composition objects.

  This class is abstract and cannot be invoked directly. You'll
  likely only interact with this class if you're a developer. Child
  classes need to implement the _featurize method for calculating
  features for a single crystal composition.

  Notes
  -----
  Some subclasses of this class will require pymatgen and matminer to be
  installed.

  """

  def featurize(self, compositions: Iterable[str],
                log_every_n: int = 1000) -> np.ndarray:
    """Calculate features for crystal compositions.

    Each input string is converted to a pymatgen `Composition` and then
    handed to `_featurize`.

    Parameters
    ----------
    compositions: Iterable[str]
      Iterable sequence of composition strings, e.g. "MoS2".
      A single composition string is also accepted and is treated as a
      sequence of length one.
    log_every_n: int, default 1000
      Logging messages reported every `log_every_n` samples.

    Returns
    -------
    features: np.ndarray
      A numpy array containing a featurized representation of
      `compositions`. Datapoints that could not be featurized contribute
      an empty array.

    Raises
    ------
    ValueError
      If pymatgen cannot be imported.

    """

    # A bare string is itself iterable (over its characters), so it has to
    # be recognised explicitly; otherwise "MoS2" would be split into
    # ['M', 'o', 'S', '2'] and featurized as four separate compositions.
    if isinstance(compositions, str) or not isinstance(compositions, Iterable):
      compositions = [compositions]
    else:
      # Convert iterables to list
      compositions = list(compositions)

    try:
      # Import from the defining module rather than the package root, since
      # the root-level re-export is not available in every pymatgen release.
      from pymatgen.core.composition import Composition
    except ImportError as err:
      raise ValueError(
          "This class requires pymatgen to be installed.") from err

    features = []
    for idx, composition in enumerate(compositions):
      if idx % log_every_n == 0:
        logger.info("Featurizing datapoint %i", idx)
      try:
        c = Composition(composition)
        features.append(self._featurize(c))
      except Exception:
        # Best-effort by design: one bad datapoint must not abort the whole
        # run. `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        logger.warning(
            "Failed to featurize datapoint %i. Appending empty array", idx)
        logger.debug(
            "Featurization error for datapoint %i", idx, exc_info=True)
        features.append(np.array([]))

    features = np.asarray(features)
    return features

  def _featurize(self, composition: "pymatgen.Composition"):
    """Calculate features for a single crystal composition.

    Subclasses must override this method; the base implementation always
    raises.

    Parameters
    ----------
    composition: pymatgen.Composition object
      Composition object for 3D inorganic crystal.

    Raises
    ------
    NotImplementedError
      Always, in this abstract base class.

    """

    raise NotImplementedError('Featurizer is not defined.')

  def __call__(self, compositions: Iterable[str]):
    """Calculate features for crystal compositions.

    Equivalent to calling `featurize(compositions)` with the default
    `log_every_n`.

    Parameters
    ----------
    compositions: Iterable[str]
      An iterable of crystal compositions.

    """

    return self.featurize(compositions)


class UserDefinedFeaturizer(Featurizer):
  """Directs usage of user-computed featurizations."""

+27 −34
Original line number Diff line number Diff line
@@ -4,11 +4,11 @@ Featurizers for inorganic crystals.

import numpy as np

from deepchem.feat import Featurizer
from deepchem.feat import StructureFeaturizer, CompositionFeaturizer
from deepchem.utils import pad_array


class ElementPropertyFingerprint(Featurizer):
class ElementPropertyFingerprint(CompositionFeaturizer):
  """
  Fingerprint of elemental properties from composition.

@@ -50,14 +50,14 @@ class ElementPropertyFingerprint(Featurizer):

    self.data_source = data_source

  def _featurize(self, comp):
  def _featurize(self, composition: "pymatgen.Composition"):
    """
    Calculate chemical fingerprint from crystal composition.

    Parameters
    ----------
    comp : str
      Reduced formula of crystal.
    composition: pymatgen.Composition object
      Composition object.

    Returns
    -------
@@ -66,24 +66,22 @@ class ElementPropertyFingerprint(Featurizer):
      stoichiometry. Some values may be NaN.

    """

    from pymatgen import Composition
    try:
      from matminer.featurizers.composition import ElementProperty

    # Get pymatgen Composition object
    c = Composition(comp)
    except ModuleNotFoundError:
      raise ValueError("This class requires matminer to be installed.")

    ep = ElementProperty.from_preset(self.data_source)

    try:
      feats = ep.featurize(c)
      feats = ep.featurize(composition)
    except:
      feats = []

    return np.array(feats)


class SineCoulombMatrix(Featurizer):
class SineCoulombMatrix(StructureFeaturizer):
  """
  Calculate sine Coulomb matrix for crystals.

@@ -126,15 +124,15 @@ class SineCoulombMatrix(Featurizer):
    self.max_atoms = int(max_atoms)
    self.flatten = flatten

  def _featurize(self, struct):
  def _featurize(self, struct: "pymatgen.Structure"):
    """
    Calculate sine Coulomb matrix from pymatgen structure.

    Parameters
    ----------
    struct : dict
      Json-serializable dictionary representation of pymatgen.core.structure
      https://pymatgen.org/pymatgen.core.structure.html
    struct : pymatgen.Structure
      A periodic crystal composed of a lattice and a sequence of atomic
      sites with 3D coordinates and elements.
      
    Returns
    -------
@@ -144,14 +142,14 @@ class SineCoulombMatrix(Featurizer):

    """

    from pymatgen import Structure
    try:
      from matminer.featurizers.structure import SineCoulombMatrix as SCM

    s = Structure.from_dict(struct)
    except ModuleNotFoundError:
      raise ValueError("This class requires matminer to be installed.")

    # Get full N x N SCM
    scm = SCM(flatten=False)
    sine_mat = scm.featurize(s)
    sine_mat = scm.featurize(struct)

    if self.flatten:
      eigs, _ = np.linalg.eig(sine_mat)
@@ -166,7 +164,7 @@ class SineCoulombMatrix(Featurizer):
    return features


class StructureGraphFeaturizer(Featurizer):
class StructureGraphFeaturizer(StructureFeaturizer):
  """
  Calculate structure graph features for crystals.

@@ -212,9 +210,9 @@ class StructureGraphFeaturizer(Featurizer):

    Parameters
    ----------
    struct : dict
      Json-serializable dictionary representation of pymatgen.core.structure
      https://pymatgen.org/pymatgen.core.structure.html
    struct : pymatgen.Structure
      A periodic crystal composed of a lattice and a sequence of atomic
      sites with 3D coordinates and elements.

    Returns
    -------
@@ -224,12 +222,7 @@ class StructureGraphFeaturizer(Featurizer):

    """

    from pymatgen import Structure

    # Get pymatgen structure object
    s = Structure.from_dict(struct)

    features = self._get_structure_graph_features(s)
    features = self._get_structure_graph_features(struct)
    features = np.array(features)

    return features
@@ -240,7 +233,7 @@ class StructureGraphFeaturizer(Featurizer):

    Parameters
    ----------
    struct : pymatgen.core.structure
    struct : pymatgen.Structure
      A periodic crystal composed of a lattice and a sequence of atomic
      sites with 3D coordinates and elements.

+28 −11
Original line number Diff line number Diff line
@@ -161,19 +161,17 @@ AtomConvFeaturizer
.. autoclass:: deepchem.feat.NeighborListComplexAtomicCoordinates
  :members:

MaterialsFeaturizers
--------------------

Materials Featurizers are those that work with datasets of inorganic crystals.
These featurizers operate on chemical compositions (e.g. "MoS2"), or on a
lattice and 3D coordinates that specify a periodic crystal structure. They
should be applied on systems that have periodic boundary conditions. Materials
featurizers are not designed to work with molecules. 
StructureFeaturizer
-------------------

ElementPropertyFingerprint
^^^^^^^^^^^^^^^^^^^^^^^^^^
Structure Featurizers are those that work with datasets of crystals with
periodic boundary conditions. For inorganic crystal structures, these
featurizers operate on pymatgen.Structure objects, which include a
lattice and 3D coordinates that specify a periodic crystal structure. 
They should be applied on systems that have periodic boundary conditions.
Structure featurizers are not designed to work with molecules. 

.. autoclass:: deepchem.feat.ElementPropertyFingerprint
.. autoclass:: deepchem.feat.StructureFeaturizer
  :members:

SineCoulombMatrix
@@ -188,6 +186,25 @@ StructureGraphFeaturizer
.. autoclass:: deepchem.feat.StructureGraphFeaturizer
  :members:

CompositionFeaturizer
---------------------

Composition Featurizers are those that work with datasets of crystal
compositions with periodic boundary conditions. 
For inorganic crystal structures, these featurizers operate on chemical
compositions (e.g. "MoS2"). They should be applied on systems that have
periodic boundary conditions. Composition featurizers are not designed 
to work with molecules. 

.. autoclass:: deepchem.feat.CompositionFeaturizer
  :members:

ElementPropertyFingerprint
^^^^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.feat.ElementPropertyFingerprint
  :members:

BindingPocketFeaturizer
-----------------------