Commit ba9cc2fd authored by Nathan Frey
Browse files

Refactor for struct and comp featurizers

parent 173ccbdb
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -3,7 +3,8 @@ Making it easy to import in classes.
"""
from deepchem.feat.base_classes import Featurizer
from deepchem.feat.base_classes import MolecularFeaturizer
from deepchem.feat.base_classes import CrystalFeaturizer
from deepchem.feat.base_classes import StructureFeaturizer
from deepchem.feat.base_classes import CompositionFeaturizer
from deepchem.feat.base_classes import ComplexFeaturizer
from deepchem.feat.base_classes import UserDefinedFeaturizer
from deepchem.feat.graph_features import ConvMolFeaturizer
+130 −29
Original line number Diff line number Diff line
@@ -208,17 +208,17 @@ class MolecularFeaturizer(Featurizer):
    return self.featurize(molecules)


class CrystalFeaturizer(Featurizer):
class StructureFeaturizer(Featurizer):
  """
  Abstract class for calculating a set of features for a
  crystal structure.
  Abstract class for calculating a set of features for an
  inorganic crystal structure.

  The defining feature of a `CrystalFeaturizer` is that it
  operates on 3D crystals with periodic boundary conditions. Inorganic
  crystal structures are represented by Pymatgen composition and structure
  The defining feature of a `StructureFeaturizer` is that it
  operates on 3D crystal structures with periodic boundary conditions. 
  Inorganic crystal structures are represented by Pymatgen structure
  objects. Featurizers for inorganic crystal structures that are subclasses of
  this class should plan to process input which comes as composition
  strings or pymatgen structure dictionaries. 
  this class should plan to process input which comes as pymatgen
  structure objects. 

  Child classes need to implement the _featurize method for
  calculating features for a single crystal.
@@ -230,14 +230,16 @@ class CrystalFeaturizer(Featurizer):

  """

  def featurize(self, crystals: Iterable, log_every_n: int = 1000) -> np.ndarray:
    """Calculate features for crystals.
  def featurize(self, structures: Iterable[dict],
                log_every_n: int = 1000) -> np.ndarray:
    """Calculate features for crystal structures.

    Parameters
    ----------
    crystals: Iterable
      Iterable sequence of composition strings, pymatgen structure
      dictionaries, or another crystal representation.
    structures: Iterable[dict]
      Iterable sequence of pymatgen structure dictionaries.
      Json-serializable dictionary representation of pymatgen.core.structure
      https://pymatgen.org/pymatgen.core.structure.html
    log_every_n: int, default 1000
      Logging messages reported every `log_every_n` samples.

@@ -245,23 +247,122 @@ class CrystalFeaturizer(Featurizer):
    -------
    features: np.ndarray
      A numpy array containing a featurized representation of
      `crystals`.
      `structures`.

    """

    # Special case handling of single crystal
    if not isinstance(crystals, Iterable):
      crystals = [crystals]
    # Special case handling of single crystal structure
    if not isinstance(structures, Iterable):
      structures = [structures]
    else:
      # Convert iterables to list
      crystals = list(crystals)
      structures = list(structures)

    try:
      from pymatgen import Structure
    except ModuleNotFoundError:
      raise ValueError("This class requires pymatgen to be installed.")

    features = []
    for idx, structure in enumerate(structures):
      if idx % log_every_n == 0:
        logger.info("Featurizing datapoint %i" % idx)
      try:
        s = Structure.from_dict(structure)
        features.append(self._featurize(s))
      except:
        logger.warning(
            "Failed to featurize datapoint %i. Appending empty array" % idx)
        features.append(np.array([]))

    features = np.asarray(features)
    return features

  def _featurize(self, structure: "pymatgen.Structure"):
    """Calculate features for a single crystal structure.

    This base implementation is a placeholder that always raises; child
    classes override it with the actual feature calculation. `featurize`
    calls it once per input, passing the object produced by
    `Structure.from_dict`.

    Parameters
    ----------
    structure: pymatgen.Structure object
      Structure object with 3D coordinates and periodic lattice.

    Raises
    ------
    NotImplementedError
      Always, unless the method is overridden by a child class.

    """

    raise NotImplementedError('Featurizer is not defined.')

  def __call__(self, structures: Iterable[dict]):
    """Calculate features for crystal structures.

    Convenience wrapper so that a featurizer instance can be called
    directly; it forwards `structures` to `featurize` and leaves
    `log_every_n` at that method's default.

    Parameters
    ----------
    structures: Iterable[dict]
      An iterable of crystal structure dictionaries.

    Returns
    -------
    features: np.ndarray
      Whatever `featurize` returns for `structures`.

    """

    return self.featurize(structures)


class CompositionFeaturizer(Featurizer):
  """
  Abstract class for calculating a set of features for an
  inorganic crystal composition.

  The defining feature of a `CompositionFeaturizer` is that it
  operates on 3D crystal chemical compositions. 
  Inorganic crystal compositions are represented by Pymatgen composition
  objects. Featurizers for inorganic crystal compositions that are 
  subclasses of this class should plan to process input which comes as
  Pymatgen composition objects. 

  Child classes need to implement the _featurize method for
  calculating features for a single composition.

  Notes
  -----
  Some subclasses of this class will require pymatgen and matminer to be
  installed.

  """

  def featurize(self, compositions: Iterable[str],
                log_every_n: int = 1000) -> np.ndarray:
    """Calculate features for crystal compositions.

    Parameters
    ----------
    compositions: Iterable[str]
      Iterable sequence of composition strings, e.g. "MoS2".
    log_every_n: int, default 1000
      Logging messages reported every `log_every_n` samples.

    Returns
    -------
    features: np.ndarray
      A numpy array containing a featurized representation of
      `compositions`.

    """

    # Special case handling of single crystal composition
    if not isinstance(compositions, Iterable):
      compositions = [compositions]
    else:
      # Convert iterables to list
      compositions = list(compositions)

    try:
      from pymatgen import Composition
    except ModuleNotFoundError:
      raise ValueError("This class requires pymatgen to be installed.")

    features = []
    for idx, crystal in enumerate(crystals):
    for idx, composition in enumerate(compositions):
      if idx % log_every_n == 0:
        logger.info("Featurizing datapoint %i" % idx)
      try:
        features.append(self._featurize(crystal))
        c = Composition(composition)
        features.append(self._featurize(c))
      except:
        logger.warning(
            "Failed to featurize datapoint %i. Appending empty array" % idx)
@@ -270,29 +371,29 @@ class CrystalFeaturizer(Featurizer):
    features = np.asarray(features)
    return features

  def _featurize(self, crystal):
    """Calculate features for a single crystal.
  def _featurize(self, composition: "pymatgen.Composition"):
    """Calculate features for a single crystal composition.

    Parameters
    ----------
    crystal: crystal representation
        Crystal.
    composition: pymatgen.Composition object
      Composition object for 3D inorganic crystal.

    """

    raise NotImplementedError('Featurizer is not defined.')

  def __call__(self, crystals: Iterable):
    """Calculate features for crystals.
  def __call__(self, compositions: Iterable[str]):
    """Calculate features for crystal compositions.

    Parameters
    ----------
    crystals: Iterable
        An iterable of crystal representations.
    compositions: Iterable[str]
      An iterable of crystal compositions.

    """

    return self.featurize(crystals)
    return self.featurize(compositions)


class UserDefinedFeaturizer(Featurizer):
+21 −37
Original line number Diff line number Diff line
@@ -4,11 +4,11 @@ Featurizers for inorganic crystals.

import numpy as np

from deepchem.feat import CrystalFeaturizer
from deepchem.feat import StructureFeaturizer, CompositionFeaturizer
from deepchem.utils import pad_array


class ElementPropertyFingerprint(CrystalFeaturizer):
class ElementPropertyFingerprint(CompositionFeaturizer):
  """
  Fingerprint of elemental properties from composition.

@@ -50,14 +50,14 @@ class ElementPropertyFingerprint(CrystalFeaturizer):

    self.data_source = data_source

  def _featurize(self, comp):
  def _featurize(self, composition: "pymatgen.Composition"):
    """
    Calculate chemical fingerprint from crystal composition.

    Parameters
    ----------
    comp : str
      Reduced formula of crystal.
    composition: pymatgen.Composition object
      Composition object.

    Returns
    -------
@@ -66,27 +66,22 @@ class ElementPropertyFingerprint(CrystalFeaturizer):
      stoichiometry. Some values may be NaN.

    """

    try:
      from pymatgen import Composition
      from matminer.featurizers.composition import ElementProperty
    except ModuleNotFoundError:
      raise ValueError("This class requires pymatgen and matminer to be installed.")

    # Get pymatgen Composition object
    c = Composition(comp)
      raise ValueError("This class requires matminer to be installed.")

    ep = ElementProperty.from_preset(self.data_source)

    try:
      feats = ep.featurize(c)
      feats = ep.featurize(composition)
    except:
      feats = []

    return np.array(feats)


class SineCoulombMatrix(CrystalFeaturizer):
class SineCoulombMatrix(StructureFeaturizer):
  """
  Calculate sine Coulomb matrix for crystals.

@@ -129,15 +124,15 @@ class SineCoulombMatrix(CrystalFeaturizer):
    self.max_atoms = int(max_atoms)
    self.flatten = flatten

  def _featurize(self, struct):
  def _featurize(self, struct: "pymatgen.Structure"):
    """
    Calculate sine Coulomb matrix from pymatgen structure.

    Parameters
    ----------
    struct : dict
      Json-serializable dictionary representation of pymatgen.core.structure
      https://pymatgen.org/pymatgen.core.structure.html
    struct : pymatgen.Structure
      A periodic crystal composed of a lattice and a sequence of atomic
      sites with 3D coordinates and elements.
      
    Returns
    -------
@@ -148,16 +143,13 @@ class SineCoulombMatrix(CrystalFeaturizer):
    """

    try:
      from pymatgen import Structure
      from matminer.featurizers.structure import SineCoulombMatrix as SCM
    except ModuleNotFoundError:
      raise ValueError("This class requires pymatgen and matminer to be installed.")

    s = Structure.from_dict(struct)
      raise ValueError("This class requires matminer to be installed.")

    # Get full N x N SCM
    scm = SCM(flatten=False)
    sine_mat = scm.featurize(s)
    sine_mat = scm.featurize(struct)

    if self.flatten:
      eigs, _ = np.linalg.eig(sine_mat)
@@ -172,7 +164,7 @@ class SineCoulombMatrix(CrystalFeaturizer):
    return features


class StructureGraphFeaturizer(CrystalFeaturizer):
class StructureGraphFeaturizer(StructureFeaturizer):
  """
  Calculate structure graph features for crystals.

@@ -218,9 +210,9 @@ class StructureGraphFeaturizer(CrystalFeaturizer):

    Parameters
    ----------
    struct : dict
      Json-serializable dictionary representation of pymatgen.core.structure
      https://pymatgen.org/pymatgen.core.structure.html
    struct : pymatgen.Structure
      A periodic crystal composed of a lattice and a sequence of atomic
      sites with 3D coordinates and elements.

    Returns
    -------
@@ -230,15 +222,7 @@ class StructureGraphFeaturizer(CrystalFeaturizer):

    """

    try:
      from pymatgen import Structure
    except ModuleNotFoundError:
      raise ValueError("This class requires pymatgen to be installed.")

    # Get pymatgen structure object
    s = Structure.from_dict(struct)

    features = self._get_structure_graph_features(s)
    features = self._get_structure_graph_features(struct)
    features = np.array(features)

    return features
@@ -249,7 +233,7 @@ class StructureGraphFeaturizer(CrystalFeaturizer):

    Parameters
    ----------
    struct : pymatgen.core.structure
    struct : pymatgen.Structure
      A periodic crystal composed of a lattice and a sequence of atomic
      sites with 3D coordinates and elements.

+25 −12
Original line number Diff line number Diff line
@@ -161,23 +161,17 @@ AtomConvFeaturizer
.. autoclass:: deepchem.feat.NeighborListComplexAtomicCoordinates
  :members:

CrystalFeaturizer
-----------------
StructureFeaturizer
-------------------

Crystal Featurizers are those that work with datasets of crystals with
Structure Featurizers are those that work with datasets of crystals with
periodic boundary conditions. For inorganic crystal structures, these
featurizers operate on chemical compositions (e.g. "MoS2"), or on a 
featurizers operate on pymatgen.Structure objects, which include a
lattice and 3D coordinates that specify a periodic crystal structure. 
They should be applied to systems that have periodic boundary conditions.
Crystal featurizers are not designed to work with molecules. 

.. autoclass:: deepchem.feat.CrystalFeaturizer
  :members:

ElementPropertyFingerprint
^^^^^^^^^^^^^^^^^^^^^^^^^^
Structure featurizers are not designed to work with molecules. 

.. autoclass:: deepchem.feat.ElementPropertyFingerprint
.. autoclass:: deepchem.feat.StructureFeaturizer
  :members:

SineCoulombMatrix
@@ -192,6 +186,25 @@ StructureGraphFeaturizer
.. autoclass:: deepchem.feat.StructureGraphFeaturizer
  :members:

CompositionFeaturizer
---------------------

Composition Featurizers are those that work with datasets of crystal
compositions. For inorganic crystals, these featurizers take chemical
composition strings (e.g. "MoS2") as input, and each string is converted
to a pymatgen Composition object before features are calculated.
They should be applied to systems that have periodic boundary conditions.
Composition featurizers are not designed to work with molecules.

.. autoclass:: deepchem.feat.CompositionFeaturizer
  :members:

ElementPropertyFingerprint
^^^^^^^^^^^^^^^^^^^^^^^^^^

.. autoclass:: deepchem.feat.ElementPropertyFingerprint
  :members:

BindingPocketFeaturizer
-----------------------