Unverified Commit ab911353 authored by Daiki Nishikawa's avatar Daiki Nishikawa Committed by GitHub
Browse files

Merge pull request #2090 from nd-02110114/fix-base-classes-feat

Add canonicalize option to MolecularFeaturizer
parents 9c879e45 c94fa0b1
Loading
Loading
Loading
Loading
+10 −10
Original line number Diff line number Diff line
@@ -88,8 +88,8 @@ class VinaPoseGenerator(PoseGenerator):
  is an environment variable you set) and invokes the executable
  to perform pose generation for you.

  Notes
  -----
  Note
  ----
  This class requires RDKit to be installed.
  """

+47 −68
Original line number Diff line number Diff line
@@ -2,10 +2,9 @@
Feature calculations.
"""
import logging
import types
import numpy as np
import multiprocessing
from typing import Any, Dict, List, Iterable, Sequence, Tuple, Union
from typing import Any, Dict, List, Iterable, Sequence, Tuple

logger = logging.getLogger(__name__)

@@ -36,6 +35,7 @@ class Featurizer(object):

    Returns
    -------
    np.ndarray
      A numpy array containing a featurized representation of `datapoints`.
    """
    datapoints = list(datapoints)
@@ -74,24 +74,6 @@ class Featurizer(object):
    raise NotImplementedError('Featurizer is not defined.')


def _featurize_callback(
    featurizer,
    mol_pdb_file,
    protein_pdb_file,
    log_message,
):
  """Callback function for apply_async in ComplexFeaturizer.

  This callback function must be defined globally
  because `apply_async` doesn't execute a nested function.

  See the details from the following link.
  https://stackoverflow.com/questions/56533827/pool-apply-async-nested-function-is-not-executed
  """
  logging.info(log_message)
  return featurizer._featurize(mol_pdb_file, protein_pdb_file)


class ComplexFeaturizer(object):
  """"
  Abstract class for calculating features for mol/protein complexes.
@@ -122,7 +104,7 @@ class ComplexFeaturizer(object):
    for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
      log_message = "Featurizing %d / %d" % (i, len(mol_files))
      results.append(
          pool.apply_async(_featurize_callback,
          pool.apply_async(ComplexFeaturizer._featurize_callback,
                           (self, mol_file, protein_pdb, log_message)))
    pool.close()
    features = []
@@ -143,30 +125,36 @@ class ComplexFeaturizer(object):

    Parameters
    ----------
    mol_pdb: list
      Should be a list of lines of the PDB file.
    complex_pdb: list
      Should be a list of lines of the PDB file.
    mol_pdb : str
      The PDB filename.
    complex_pdb : str
      The PDB filename.
    """
    raise NotImplementedError('Featurizer is not defined.')

  @staticmethod
  def _featurize_callback(featurizer, mol_pdb_file, protein_pdb_file,
                          log_message):
    logging.info(log_message)
    return featurizer._featurize(mol_pdb_file, protein_pdb_file)


class MolecularFeaturizer(Featurizer):
  """Abstract class for calculating a set of features for a
  molecule.

  The defining feature of a `MolecularFeaturizer` is that it
  uses SMILES strings and RDKIT molecule objects to represent
  uses SMILES strings and RDKit molecule objects to represent
  small molecules. All other featurizers which are subclasses of
  this class should plan to process input which comes as smiles
  strings or RDKIT molecules. 
  strings or RDKit molecules.

  Child classes need to implement the _featurize method for
  calculating features for a single molecule.

  Note
  ----
  In general, subclasses of this class will require RDKit to be installed.
  Notes
  -----
  The subclasses of this class require RDKit to be installed.
  """

  def featurize(self, molecules, log_every_n=1000):
@@ -174,14 +162,16 @@ class MolecularFeaturizer(Featurizer):

    Parameters
    ----------
    molecules: RDKit Mol / SMILES string / iterable
    molecules: rdkit.Chem.rdchem.Mol / SMILES string / iterable
      RDKit Mol, or SMILES string or iterable sequence of RDKit mols/SMILES
      strings.
    log_every_n: int, default 1000
      Logging messages reported every `log_every_n` samples.

    Returns
    -------
    A numpy array containing a featurized representation of
    `datapoints`.
    features: np.ndarray
      A numpy array containing a featurized representation of `datapoints`.
    """
    try:
      from rdkit import Chem
@@ -190,26 +180,23 @@ class MolecularFeaturizer(Featurizer):
      from rdkit.Chem.rdchem import Mol
    except ModuleNotFoundError:
      raise ValueError("This class requires RDKit to be installed.")

    # Special case handling of single molecule
    if isinstance(molecules, str) or isinstance(molecules, Mol):
      molecules = [molecules]
    else:
      # Convert iterables to list
      molecules = list(molecules)

    features = []
    for i, mol in enumerate(molecules):
      if i % log_every_n == 0:
        logger.info("Featurizing datapoint %i" % i)
      try:
        # Process only case of SMILES strings.
        if isinstance(mol, str):
          # mol must be a SMILES string so parse
          # mol must be a RDKit Mol object, so parse a SMILES
          mol = Chem.MolFromSmiles(mol)
          # TODO (ytz) this is a bandage solution to reorder the atoms
          # so that they're always in the same canonical order.
          # Presumably this should be correctly implemented in the
          # future for graph mols.
          if mol:
          # SMILES is unique, so set a canonical order of atoms
          new_order = rdmolfiles.CanonicalRankAtoms(mol)
          mol = rdmolops.RenumberAtoms(mol, new_order)
        features.append(self._featurize(mol))
@@ -243,7 +230,6 @@ class MaterialStructureFeaturizer(Featurizer):
  -----
  Some subclasses of this class will require pymatgen and matminer to be
  installed.

  """

  def featurize(self,
@@ -265,16 +251,13 @@ class MaterialStructureFeaturizer(Featurizer):
    features: np.ndarray
      A numpy array containing a featurized representation of
      `structures`.

    """

    structures = list(structures)

    try:
      from pymatgen import Structure
    except ModuleNotFoundError:
      raise ValueError("This class requires pymatgen to be installed.")

    structures = list(structures)
    features = []
    for idx, structure in enumerate(structures):
      if idx % log_every_n == 0:
@@ -312,7 +295,6 @@ class MaterialCompositionFeaturizer(Featurizer):
  -----
  Some subclasses of this class will require pymatgen and matminer to be
  installed.

  """

  def featurize(self, compositions: Iterable[str],
@@ -331,16 +313,13 @@ class MaterialCompositionFeaturizer(Featurizer):
    features: np.ndarray
      A numpy array containing a featurized representation of
      `compositions`.

    """

    compositions = list(compositions)

    try:
      from pymatgen import Composition
    except ModuleNotFoundError:
      raise ValueError("This class requires pymatgen to be installed.")

    compositions = list(compositions)
    features = []
    for idx, composition in enumerate(compositions):
      if idx % log_every_n == 0:
+4 −4
Original line number Diff line number Diff line
@@ -226,10 +226,10 @@ class BatchGraphData(GraphData):

    # create new edge index
    num_nodes_list = [graph.num_nodes for graph in graph_list]
    batch_edge_index = np.hstack(
      [graph.edge_index + prev_num_node for prev_num_node, graph \
        in zip([0] + num_nodes_list[:-1], graph_list)]
    )
    batch_edge_index = np.hstack([
        graph.edge_index + prev_num_node
        for prev_num_node, graph in zip([0] + num_nodes_list[:-1], graph_list)
    ])

    # graph_index indicates which nodes belong to which graph
    graph_index = []
+1 −0
Original line number Diff line number Diff line
"""
Featurizers for inorganic crystals.
"""
# flake8: noqa
from deepchem.feat.material_featurizers.element_property_fingerprint import ElementPropertyFingerprint
from deepchem.feat.material_featurizers.sine_coulomb_matrix import SineCoulombMatrix
from deepchem.feat.material_featurizers.cgcnn_featurizer import CGCNNFeaturizer
+1 −0
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ ignore =
    E129,  # Visually indented line with same indent as next logical line
    W503,  # Line break before binary operator
    W504,  # Line break after binary operator
    E722   # do not use bare 'except'
max-line-length = 300

[yapf]
+8 −8

File changed.

Contains only whitespace changes.

+20 −20

File changed.

Contains only whitespace changes.

+14 −14

File changed.

Contains only whitespace changes.

+17 −17

File changed.

Contains only whitespace changes.

+11 −11

File changed.

Contains only whitespace changes.

+7 −7

File changed.

Contains only whitespace changes.

+5 −5

File changed.

Contains only whitespace changes.

+2 −2

File changed.

Contains only whitespace changes.

+9 −9

File changed.

Contains only whitespace changes.

+9 −9

File changed.

Contains only whitespace changes.

+14 −14

File changed.

Contains only whitespace changes.

+10 −10

File changed.

Contains only whitespace changes.

+7 −7

File changed.

Contains only whitespace changes.

+17 −17

File changed.

Contains only whitespace changes.

+19 −19

File changed.

Contains only whitespace changes.

Loading