Commit d7a99002 authored by nd-02110114's avatar nd-02110114
Browse files

♻️ refactor base featurizer classes

parent c94d9653
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -109,7 +109,7 @@ class Docker(object):
    if self.scoring_model is not None:
      for posed_complex in complexes:
        # TODO: How to handle the failure here?
        features, _ = self.featurizer.featurize_complexes([molecular_complex])
        features, _ = self.featurizer.featurize([molecular_complex])
        dataset = NumpyDataset(X=features)
        score = self.scoring_model.predict(dataset)
        yield (posed_complex, score)
+1 −1
Original line number Diff line number Diff line
@@ -105,7 +105,7 @@ class TestDocking(unittest.TestCase):

    class DummyFeaturizer(ComplexFeaturizer):

      def featurize_complexes(self, complexes, *args, **kwargs):
      def featurize(self, complexes, *args, **kwargs):
        return np.zeros((len(complexes), 5)), None

    class DummyModel(Model):
+2 −3
Original line number Diff line number Diff line
@@ -3,7 +3,6 @@ Atomic coordinate featurizer.
"""
import logging
import numpy as np
from deepchem.utils.save import log
from deepchem.feat import Featurizer
from deepchem.feat import ComplexFeaturizer
from deepchem.utils import rdkit_util, pad_array
@@ -162,7 +161,7 @@ class NeighborListComplexAtomicCoordinates(ComplexFeaturizer):
    self.dtype = object
    self.coordinates_featurizer = AtomicCoordinates()

  def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
  def _featurize(self, mol_pdb_file, protein_pdb_file):
    """
    Compute neighbor list for complex.

@@ -218,7 +217,7 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
    self.neighborlist_featurizer = NeighborListComplexAtomicCoordinates(
        self.max_num_neighbors, self.neighbor_cutoff)

  def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
  def _featurize(self, mol_pdb_file, protein_pdb_file):
    try:
      frag1_coords, frag1_mol = rdkit_util.load_molecule(
          mol_pdb_file, is_protein=False, sanitize=True, add_hydrogens=False)
+12 −88
Original line number Diff line number Diff line
@@ -12,11 +12,6 @@ logger = logging.getLogger(__name__)
JSON = Dict[str, Any]


def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
  """Log a progress message, then featurize one mol/protein pair.

  Defined at module level and handed to ``pool.apply_async`` by the
  complex featurizer, with the featurizer passed as an explicit first
  argument.

  Parameters
  ----------
  featurizer: object
    Object exposing a ``_featurize_complex(mol_pdb_file, protein_pdb_file)``
    method.
  mol_pdb_file: object
    Forwarded unchanged as the first argument of the featurizer method.
  protein_pdb_file: object
    Forwarded unchanged as the second argument of the featurizer method.
  log_message: str
    Message emitted through ``logging.info`` before featurizing.

  Returns
  -------
  Whatever ``featurizer._featurize_complex`` returns for this pair.
  """
  logging.info(log_message)
  featurize_pair = featurizer._featurize_complex
  return featurize_pair(mol_pdb_file, protein_pdb_file)


class Featurizer(object):
  """Abstract class for calculating a set of features for a datapoint.

@@ -57,24 +52,23 @@ class Featurizer(object):
    features = np.asarray(features)
    return features

  def __call__(self, datapoints):
    """Calculate features for datapoints.
  def _featurize(self, datapoint):
    """Calculate features for a single datapoint.

    Parameters
    ----------
    datapoints: object 
       Any blob of data you like. Subclasss should instantiate
       this. 
    datapoint: object
      a single datapoint in a sequence of objects
    """
    return self.featurize(datapoints)
    raise NotImplementedError('Featurizer is not defined.')


class ComplexFeaturizer(object):
class ComplexFeaturizer(Featurizer):
  """"
  Abstract class for calculating features for mol/protein complexes.
  """

  def featurize_complexes(self, mol_files, protein_pdbs):
  def featurize(self, mol_files, protein_pdbs):
    """
    Calculate features for mol/protein complexes.

@@ -97,7 +91,7 @@ class ComplexFeaturizer(object):
    for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
      log_message = "Featurizing %d / %d" % (i, len(mol_files))
      results.append(
          pool.apply_async(_featurize_complex,
          pool.apply_async(self._featurize,
                           (self, mol_file, protein_pdb, log_message)))
    pool.close()
    features = []
@@ -112,7 +106,7 @@ class ComplexFeaturizer(object):
    features = np.asarray(features)
    return features, failures

  def _featurize_complex(self, mol_pdb, complex_pdb):
  def _featurize(self, mol_pdb, complex_pdb):
    """
    Calculate features for single mol/protein complex.

@@ -187,28 +181,6 @@ class MolecularFeaturizer(Featurizer):
    features = np.asarray(features)
    return features

  def _featurize(self, mol):
    """
    Calculate features for a single molecule.

    This base implementation computes nothing and always raises.
    Presumably concrete featurizers override it (the subclasses are not
    visible here -- confirm).

    Parameters
    ----------
    mol : RDKit Mol
        Molecule.

    Raises
    ------
    NotImplementedError
        Always, with the message 'Featurizer is not defined.'.
    """
    raise NotImplementedError('Featurizer is not defined.')

  def __call__(self, molecules):
    """
    Calculate features for molecules.

    Makes the featurizer instance callable: `molecules` is forwarded
    unchanged to `self.featurize` and its result is returned as-is.

    Parameters
    ----------
    molecules: iterable
        An iterable yielding RDKit Mol objects or SMILES strings.

    Returns
    -------
    Whatever `self.featurize(molecules)` returns.
    """
    return self.featurize(molecules)


class StructureFeaturizer(Featurizer):
  """
@@ -282,30 +254,6 @@ class StructureFeaturizer(Featurizer):
    features = np.asarray(features)
    return features

  def _featurize(self, structure: "pymatgen.Structure"):
    """Calculate features for a single crystal structure.

    This base implementation computes nothing and always raises.
    Presumably concrete structure featurizers override it (the
    subclasses are not visible here -- confirm).

    Parameters
    ----------
    structure: pymatgen.Structure object
      Structure object with 3D coordinates and periodic lattice.

    Raises
    ------
    NotImplementedError
      Always, with the message 'Featurizer is not defined.'.

    """

    raise NotImplementedError('Featurizer is not defined.')

  def __call__(self, structures: Iterable[dict]):
    """Calculate features for crystal structures.

    Makes the featurizer instance callable: `structures` is forwarded
    unchanged to `self.featurize` and its result is returned as-is.

    Parameters
    ----------
    structures: Iterable[dict]
      An iterable of crystal structure dictionaries.

    Returns
    -------
    Whatever `self.featurize(structures)` returns.

    """

    return self.featurize(structures)


class CompositionFeaturizer(Featurizer):
  """
@@ -377,30 +325,6 @@ class CompositionFeaturizer(Featurizer):
    features = np.asarray(features)
    return features

  def _featurize(self, composition: "pymatgen.Composition"):
    """Calculate features for a single crystal composition.

    This base implementation computes nothing and always raises.
    Presumably concrete composition featurizers override it (the
    subclasses are not visible here -- confirm).

    Parameters
    ----------
    composition: pymatgen.Composition object
      Composition object for 3D inorganic crystal.

    Raises
    ------
    NotImplementedError
      Always, with the message 'Featurizer is not defined.'.

    """

    raise NotImplementedError('Featurizer is not defined.')

  def __call__(self, compositions: Iterable[str]):
    """Calculate features for crystal compositions.

    Makes the featurizer instance callable: `compositions` is forwarded
    unchanged to `self.featurize` and its result is returned as-is.

    Parameters
    ----------
    compositions: Iterable[str]
      An iterable of crystal compositions.

    Returns
    -------
    Whatever `self.featurize(compositions)` returns.

    """

    return self.featurize(compositions)


class UserDefinedFeaturizer(Featurizer):
  """Directs usage of user-computed featurizations."""
+2 −9
Original line number Diff line number Diff line
import enum
import numpy as np
import deepchem as dc
from deepchem.feat.base_classes import MolecularFeaturizer
from deepchem.feat.atomic_coordinates import ComplexNeighborListFragmentAtomicCoordinates
from deepchem.feat.mol_graphs import ConvMol, WeaveMol
from deepchem.data import DiskDataset
import multiprocessing
import logging


def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
  """Log a progress message, then featurize one mol/protein pair.

  Parameters
  ----------
  featurizer: object
    Object exposing a ``_featurize_complex(mol_pdb_file, protein_pdb_file)``
    method.
  mol_pdb_file: object
    Forwarded unchanged as the first argument of the featurizer method.
  protein_pdb_file: object
    Forwarded unchanged as the second argument of the featurizer method.
  log_message: str
    Message emitted through ``logging.info`` before featurizing.

  Returns
  -------
  Whatever ``featurizer._featurize_complex`` returns for this pair.
  """
  logging.info(log_message)
  compute_features = featurizer._featurize_complex
  result = compute_features(mol_pdb_file, protein_pdb_file)
  return result


def one_of_k_encoding(x, allowable_set):
  """Encodes elements of a provided set as integers.

@@ -815,12 +808,12 @@ class AtomicConvFeaturizer(ComplexNeighborListFragmentAtomicCoordinates):
    self.epochs = epochs
    self.labels = labels

  def featurize_complexes(self, mol_files, protein_files):
  def featurize(self, mol_files, protein_files):
    features = []
    failures = []
    for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_files)):
      logging.info("Featurizing %d / %d" % (i, len(mol_files)))
      new_features = self._featurize_complex(mol_file, protein_pdb)
      new_features = self._featurize(mol_file, protein_pdb)
      # Handle loading failures which return None
      if new_features is not None:
        features.append(new_features)
Loading