Commit 57c45b81 authored by yurievnamaria's avatar yurievnamaria
Browse files

Added example and corrected description for ConvMolFeaturizer(), fixed...

Added example and corrected description for ConvMolFeaturizer(), fixed different sizes of x,w,y,ids in FlatteningTransformer+added tests
parent d09a41f4
Loading
Loading
Loading
Loading
+10 −4
Original line number Diff line number Diff line
@@ -632,6 +632,10 @@ class ConvMolFeaturizer(MolecularFeaturizer):

  Examples
  ---------
  >>> import deepchem as dc
  >>> smiles = ["C", "CCC"]
  >>> featurizer=dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False)
  >>> f = featurizer.featurize(smiles)
  Using ConvMolFeaturizer to create featurized fragments derived from molecules of interest.
  This is used only in the context of performing interpretation of models using atomic
  contributions (atom-based model interpretation)
@@ -688,10 +692,12 @@ class ConvMolFeaturizer(MolecularFeaturizer):
      name of the molecule level property in mol where the solvent
      accessible surface area of atom 0 would be stored.
    per_atom_fragmentation: Boolean
      If True, then multiple "atom-depleted" featurizations will be possible to do for each molecule. It will be
      possible to remove atoms  one by one, and then, featurize each atom-depleted molecule.
      Thus, applying featurize method  will produce a set of ConvMol objects for each molecule. This is useful for
      subsequent model interpretation: finding atoms favorable/unfavorable for (modelled) activity.
      If True, then multiple "atom-depleted" versions of each molecule will be
      created when featurize() is called. For each molecule, atoms are removed
      one at a time and the resulting molecule is featurized. The result is a
      list of ConvMol objects, one with each heavy atom removed. This is useful
      for subsequent model interpretation: finding atoms favorable/unfavorable
      for (modelled) activity. This option is typically used in combination
      with a FlatteningTransformer to split the lists into separate samples.

    Since ConvMol is an object and not a numpy array, need to set dtype to
    object.
+37 −2
Original line number Diff line number Diff line
import tempfile
import os
import deepchem as dc
import numpy as np


def test_flattening_with_csv_load():
def test_flattening_with_csv_load_withtask():
  """Flatten a per-atom-fragmented CSV dataset that has a task column.

  A single benzene row is featurized with per_atom_fragmentation=True and then
  passed through FlatteningTransformer. The flattened dataset must contain six
  samples, and y and w must be expanded to the same number of rows as X.
  """
  # delete=False keeps the file on disk after it is closed so that the loader
  # can open it again by name; it is removed explicitly in the finally below.
  with tempfile.NamedTemporaryFile(mode='w', delete=False) as fin:
    fin.write("smiles,endpoint\nc1ccccc1,1")
  try:
    loader = dc.data.CSVLoader(
        ["endpoint"],
        feature_field="smiles",
        featurizer=dc.feat.ConvMolFeaturizer(per_atom_fragmentation=True))
    frag_dataset = loader.create_dataset(fin.name)
    transformer = dc.trans.FlatteningTransformer(dataset=frag_dataset)
    frag_dataset = transformer.transform(frag_dataset)
    assert len(frag_dataset) == 6
    # y should be expanded up to X shape
    assert np.shape(frag_dataset.y) == (6, 1)
    # w should be expanded up to X shape
    assert np.shape(frag_dataset.w) == (6, 1)
  finally:
    # Do not leave the temporary CSV behind, even when an assertion fails.
    os.remove(fin.name)


def test_flattening_with_csv_load_notask():
  fin = tempfile.NamedTemporaryFile(mode='w', delete=False)
  fin.write("smiles,endpoint\nc1ccccc1,1")
  fin.close()
@@ -17,7 +36,23 @@ def test_flattening_with_csv_load():
  assert len(frag_dataset) == 6


def test_flattening_with_sdf_load():
def test_flattening_with_sdf_load_withtask():
  """Flatten a per-atom-fragmented SDF dataset that has a task column.

  Loads membrane_permeability.sdf (located next to this test file) with the
  "LogP(RRCK)" task, flattens the fragment lists, and checks that X, y and w
  all end up with 98 rows.
  """
  expected_rows = 98
  sdf_path = os.path.join(
      os.path.dirname(os.path.realpath(__file__)), "membrane_permeability.sdf")
  loader = dc.data.SDFLoader(
      ["LogP(RRCK)"],
      featurizer=dc.feat.ConvMolFeaturizer(per_atom_fragmentation=True),
      sanitize=True)
  dataset = loader.create_dataset(sdf_path)
  flattened = dc.trans.FlatteningTransformer(dataset=dataset).transform(dataset)
  assert len(flattened) == expected_rows
  # y should be expanded up to X shape
  assert np.shape(flattened.y) == (expected_rows, 1)
  # w should be expanded up to X shape
  assert np.shape(flattened.w) == (expected_rows, 1)


def test_flattening_with_sdf_load_notask():
  cur_dir = os.path.dirname(os.path.realpath(__file__))
  featurizer = dc.feat.ConvMolFeaturizer(per_atom_fragmentation=True)
  loader = dc.data.SDFLoader([], featurizer=featurizer, sanitize=True)
+25 −3
Original line number Diff line number Diff line
@@ -1037,8 +1037,23 @@ class FlatteningTransformer(Transformer):
  """

  def __init__(self, dataset: Dataset):
    """Initializes flattening transformation.

    X and ids are always transformed. y and w are transformed only when the
    dataset reports a non-empty shape for them.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset object to be transformed
    """
    super(FlatteningTransformer, self).__init__(
        transform_X=True, transform_ids=True, dataset=dataset)
    # Query the shapes once; index 1 is used for y and index 2 for w.
    shapes = dataset.get_shape()
    self.transform_y = shapes[1] != tuple()  # iff y passed, then transform it
    self.transform_w = shapes[2] != tuple()  # iff w passed, then transform it

  def transform_array(
      self, X: np.ndarray, y: np.ndarray, w: np.ndarray,
@@ -1067,10 +1082,17 @@ class FlatteningTransformer(Transformer):
    idstrans: np.ndarray
      Transformed array of ids
    """

    ids = np.repeat(
        ids, [len(i)
              for i in X], axis=0)  # each fragment should recieve parent mol id
    if self.transform_y:
      y = np.repeat(
          y, [len(i) for i in X], axis=0
      )  # for consistency of shapes each fragment should recieve parent mol y
    if self.transform_w:
      w = np.repeat(
          w, [len(i) for i in X], axis=0
      )  # for consistency of shapes each fragment should recieve parent mol w
    X = np.array([j for i in X for j in i])  # flatten
    return (X, y, w, ids)