Unverified Commit 28d0e19b authored by Daiki Nishikawa's avatar Daiki Nishikawa Committed by GitHub
Browse files

Merge pull request #2136 from nd-02110114/fix-docs-2

Refactor featurizer
parents e65fbe1c 1512d9c3
Loading
Loading
Loading
Loading
+10 −9
Original line number Diff line number Diff line
@@ -13,22 +13,23 @@ from deepchem.feat.base_classes import UserDefinedFeaturizer

from deepchem.feat.graph_features import ConvMolFeaturizer
from deepchem.feat.graph_features import WeaveFeaturizer
from deepchem.feat.fingerprints import CircularFingerprint
from deepchem.feat.rdkit_descriptors import RDKitDescriptors
from deepchem.feat.coulomb_matrices import CoulombMatrix
from deepchem.feat.coulomb_matrices import CoulombMatrixEig
from deepchem.feat.coulomb_matrices import BPSymmetryFunctionInput
from deepchem.feat.rdkit_grid_featurizer import RdkitGridFeaturizer
from deepchem.feat.binding_pocket_features import BindingPocketFeaturizer
from deepchem.feat.one_hot import OneHotFeaturizer
from deepchem.feat.raw_featurizer import RawFeaturizer
from deepchem.feat.atomic_coordinates import AtomicCoordinates
from deepchem.feat.atomic_coordinates import NeighborListComplexAtomicCoordinates
from deepchem.feat.adjacency_fingerprints import AdjacencyFingerprint
from deepchem.feat.smiles_featurizers import SmilesToSeq, SmilesToImage

# molecule featurizers
from deepchem.feat.molecule_featurizers import MolGraphConvFeaturizer
from deepchem.feat.molecule_featurizers import CircularFingerprint
from deepchem.feat.molecule_featurizers import CoulombMatrix
from deepchem.feat.molecule_featurizers import CoulombMatrixEig
from deepchem.feat.molecule_featurizers import MordredDescriptors
from deepchem.feat.molecule_featurizers import Mol2VecFingerprint
from deepchem.feat.molecule_featurizers import OneHotFeaturizer
from deepchem.feat.molecule_featurizers import RawFeaturizer
from deepchem.feat.molecule_featurizers import RDKitDescriptors
from deepchem.feat.molecule_featurizers import SmilesToImage
from deepchem.feat.molecule_featurizers import SmilesToSeq, create_char_to_idx

# material featurizers
from deepchem.feat.material_featurizers import ElementPropertyFingerprint
+5 −6
Original line number Diff line number Diff line
@@ -133,7 +133,6 @@ class NeighborListAtomicCoordinates(Featurizer):
      mol: rdkit Mol
        To be featurized.
    """
    N = mol.GetNumAtoms()
    # TODO(rbharath): Should this return a list?
    bohr_coords = self.coordinates_featurizer._featurize(mol)[0]
    coords = get_coords(mol)
@@ -168,9 +167,9 @@ class NeighborListComplexAtomicCoordinates(ComplexFeaturizer):

    Parameters
    ----------
    mol_pdb_file: Str 
    mol_pdb_file: str
      Filename for ligand pdb file.
    protein_pdb_file: Str 
    protein_pdb_file: str
      Filename for protein pdb file.
    """
    mol_coords, ob_mol = load_molecule(mol_pdb_file)
@@ -245,7 +244,7 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):

      system_coords, system_neighbor_list, system_z = self.featurize_mol(
          system_coords, system_mol, self.complex_num_atoms)
    except ValueError as e:
    except ValueError:
      logging.warning(
          "max_atoms was set too low. Some complexes too large and skipped")
      return None
+19 −1
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ import numpy as np
import multiprocessing
from typing import Any, Dict, List, Iterable, Sequence, Tuple, Union

from deepchem.utils import get_print_threshold
from deepchem.utils.typing import PymatgenStructure

logger = logging.getLogger(__name__)
@@ -96,7 +97,15 @@ class Featurizer(object):
    args_names = [arg for arg in args_spec.args if arg != 'self']
    args_info = ''
    for arg_name in args_names:
      args_info += arg_name + '=' + str(self.__dict__[arg_name]) + ', '
      value = self.__dict__[arg_name]
      # for str
      if isinstance(value, str):
        value = "'" + value + "'"
      # for list
      if isinstance(value, list):
        threshold = get_print_threshold()
        value = np.array2string(np.array(value), threshold=threshold)
      args_info += arg_name + '=' + str(value) + ', '
    return self.__class__.__name__ + '[' + args_info[:-2] + ']'

  def __str__(self) -> str:
@@ -126,6 +135,15 @@ class Featurizer(object):
    override_args_info = ''
    for arg_name, default in zip(args_names, args_default_values):
      arg_value = self.__dict__[arg_name]
      # validation
      # skip list
      if isinstance(arg_value, list):
        continue
      if isinstance(arg_value, str):
        # skip path string
        if "\\/." in arg_value or "/" in arg_value or '.' in arg_value:
          continue
      # main logic
      if default != arg_value:
        override_args_info += '_' + arg_name + '_' + str(arg_value)
    return self.__class__.__name__ + override_args_info
+23 −9
Original line number Diff line number Diff line
@@ -3,14 +3,17 @@ Featurizes proposed binding pockets.
"""
import numpy as np
import logging
from typing import Dict, List

from deepchem.feat import Featurizer
from deepchem.utils.coordinate_box_utils import CoordinateBox
from deepchem.utils.rdkit_utils import load_molecule

logger = logging.getLogger(__name__)


def boxes_to_atoms(coords, boxes):
def boxes_to_atoms(coords: np.ndarray, boxes: List[CoordinateBox]
                  ) -> Dict[CoordinateBox, List[int]]:
  """Maps each box to a list of atoms in that box.

  Given the coordinates of a macromolecule, and a collection of boxes,
@@ -20,13 +23,14 @@ def boxes_to_atoms(coords, boxes):
  Parameters
  ----------
  coords: np.ndarray
    Of shape `(N, 3)
    A numpy array of shape `(N, 3)`
  boxes: list
    list of `CoordinateBox` objects.
    List of `CoordinateBox` objects.

  Returns
  -------
  dictionary mapping `CoordinateBox` objects to lists of atom coordinates
  Dict[CoordinateBox, List[int]]
    A dictionary mapping `CoordinateBox` objects to lists of atom indices.
  """
  mapping = {}
  for box_ind, box in enumerate(boxes):
@@ -57,6 +61,10 @@ class BindingPocketFeaturizer(Featurizer):
  implementation for more sophisticated downstream usecases. Note that
  this class's implementation will only work for proteins and not for
  other macromolecules.

  Notes
  -----
  This class requires mdtraj to be installed.
  """

  residues = [
@@ -67,7 +75,9 @@ class BindingPocketFeaturizer(Featurizer):

  n_features = len(residues)

  def featurize(self, protein_file, pockets):
  # FIXME: Signature of "featurize" incompatible with supertype "Featurizer"
  def featurize(  # type: ignore[override]
      self, protein_file: str, pockets: List[CoordinateBox]) -> np.ndarray:
    """
    Calculate atomic coordinates.

@@ -75,14 +85,19 @@ class BindingPocketFeaturizer(Featurizer):
    ----------
    protein_file: str
      Location of PDB file. Will be loaded by MDTraj
    pockets: list[CoordinateBox]
    pockets: List[CoordinateBox]
      List of `dc.utils.CoordinateBox` objects.

    Returns
    -------
    np.ndarray
      A numpy array of shape `(len(pockets), n_residues)`
    """
    try:
      import mdtraj
    except ModuleNotFoundError:
      raise ValueError("This class requires RDKit to be installed.")

    protein_coords = load_molecule(
        protein_file, add_hydrogens=False, calc_charges=False)[0]
    mapping = boxes_to_atoms(protein_coords, pockets)
@@ -101,6 +116,5 @@ class BindingPocketFeaturizer(Featurizer):
        if residue not in res_map:
          logger.info("Warning: Non-standard residue in PDB file")
          continue
        atomtype = atom_name.split("-")[1]
        all_features[pocket_num, res_map[residue]] += 1
    return all_features
Loading