Commit a1ec4f3f authored by yurievnamaria's avatar yurievnamaria
Browse files

Added transformer that flattens fragment dataset (FlatteningTransformer)....

Added transformer that flattens fragment dataset (FlatteningTransformer). Added tests for it. (Removed FragmentLoaders & their tests.)
parent c3b0a425
Loading
Loading
Loading
Loading
+0 −3
Original line number Diff line number Diff line
@@ -14,13 +14,10 @@ from deepchem.data.datasets import sparsify_features
from deepchem.data.datasets import densify_features
from deepchem.data.supports import *
from deepchem.data.data_loader import DataLoader
from deepchem.data.data_loader import FragmentLoader
from deepchem.data.data_loader import CSVLoader
from deepchem.data.data_loader import CSVFragmentLoader
from deepchem.data.data_loader import UserCSVLoader
from deepchem.data.data_loader import JsonLoader
from deepchem.data.data_loader import SDFLoader
from deepchem.data.data_loader import SDFFragmentLoader
from deepchem.data.data_loader import FASTALoader
from deepchem.data.data_loader import ImageLoader
from deepchem.data.data_loader import InMemoryLoader
+0 −150
Original line number Diff line number Diff line
@@ -401,136 +401,6 @@ class CSVLoader(DataLoader):
    return np.array(features), valid_inds


class FragmentLoader(DataLoader):
  """Abstract base class for loaders that build fragment datasets.

  Fragment datasets support structural interpretation of models
  (method described in [1]_). Each source molecule (from sdf/csv files)
  is split into atom-depleted copies — one per atom — and the featurized
  fragments are collected into a `Dataset`. Predicting on such a dataset
  lets one detect which atoms drive the modelled activity.

  Do not instantiate this class directly. Use one of its concrete
  subclasses (`CSVFragmentLoader`, `SDFFragmentLoader`) together with
  `ConvMolFeaturizer(per_atom_fragmentation=True)`, then call
  `create_dataset()` on the input file(s).

  References
  ---------

  .. [1] Polishchuk, P., et al. J. Chem. Inf. Model. 2016, 56, 8, 1455–1469

  Note
  _________
  Detailed examples of `GraphConvModel` interpretation are provided in Tutorial #28
  """

  def create_dataset(self,
                     inputs: OneOrMany[Any],
                     data_dir: Optional[str] = None,
                     shard_size: Optional[int] = 8192) -> Dataset:
    """Featurize `inputs` into a fragment `Dataset`.

    Overrides `DataLoader.create_dataset()`. The difference from the
    parent is that featurized output here is a list of fragment lists
    (one list per molecule), which is flattened, and every fragment is
    assigned the id of its parent molecule.

    Relies on the helper methods `_get_shards` and `_featurize_shard`
    being implemented and on each shard being a pandas dataframe.

    Parameters
    ----------
    inputs: List
      List of inputs to process. Entries can be filenames or arbitrary objects.
    data_dir: str, optional (default None)
      Directory to store featurized dataset.
    shard_size: int, optional (default 8192)
      Number of examples stored in each shard.

    Returns
    -------
    DiskDataset
      A `DiskDataset` object containing a featurized representation of data
      from `inputs`.
    """
    logger.info("Loading raw samples now.")
    logger.info("shard_size: %s" % str(shard_size))

    # Normalize a lone input into a list so sharding sees a uniform type.
    if not isinstance(inputs, list):
      inputs = [inputs]

    def generate_shards():
      for shard_num, shard in enumerate(self._get_shards(inputs, shard_size)):
        t_start = time.time()
        X, valid_inds = self._featurize_shard(shard)
        if len(self.tasks) > 0:
          warnings.warn(
              "Tasks and weights will be ignored, fragments can't have them")
        parent_ids = shard[self.id_field].values[valid_inds]
        # Every fragment inherits the id of the molecule it came from.
        frag_counts = [len(frags) for frags in X]
        ids = np.repeat(parent_ids, frag_counts, axis=0)
        # Collapse the per-molecule fragment lists into one flat array.
        X = np.array([frag for frags in X for frag in frags])
        # Fragment activities are unknown, so labels and weights make no
        # sense here.
        y, w = None, None
        assert len(X) == len(ids)
        t_end = time.time()
        logger.info("TIMING: featurizing shard %d took %0.3f s" %
                    (shard_num, t_end - t_start))
        yield X, y, w, ids

    return DiskDataset.create_dataset(generate_shards(), data_dir, self.tasks)


class CSVFragmentLoader(CSVLoader, FragmentLoader):
  """
  Creates `Dataset` objects from input CSV files, when you are interested in
  fragment dataset (initialize the loader with `ConvMolFeaturizer(per_atom_fragmentation=True)`).

  This class provides exact same functionality as `CSVLoader`, except it uses `create_dataset()`
   method able to handle fragments.

  Examples
  --------
  Let's suppose we have some smiles and labels and we want to fragment these mols.

  >>> smiles = ["C", "CCC"]
  >>> labels = [1.5, 2.3]

  Let's put these in a dataframe.

  >>> import pandas as pd
  >>> df = pd.DataFrame(list(zip(smiles, labels)), columns=["smiles", "task1"])

  Let's now write this to disk somewhere. We can now use `CSVFragmentLoader` to
  process this CSV dataset into fragments.

  >>> import tempfile
  >>> import deepchem as dc
  >>> with dc.utils.UniversalNamedTemporaryFile(mode='w') as tmpfile:
  ...   df.to_csv(tmpfile.name)
  ...   loader = dc.data.CSVFragmentLoader([], feature_field="smiles",
  ...                              featurizer=dc.feat.ConvMolFeaturizer(per_atom_fragmentation=True))
  ...   dataset = loader.create_dataset(tmpfile.name)
  >>> len(dataset) # equals sum of all fragments from molecules, that is 0 + 3
  3
  """
  pass


class UserCSVLoader(CSVLoader):
  """
  Handles loading of CSV files with user-defined features.
@@ -919,26 +789,6 @@ class SDFLoader(DataLoader):
    return np.array(features), valid_inds


class SDFFragmentLoader(SDFLoader, FragmentLoader):
  """Creates a `Dataset` object from SDF input files, when you are interested in
  fragment dataset (initialize the loader with `ConvMolFeaturizer(per_atom_fragmentation=True)`).

  This class provides exact same functionality as `SDFLoader`, except it uses `create_dataset()`
   method able to handle fragments.

  Examples
  --------
  >>> import deepchem as dc
  >>> import os
  >>> current_dir = os.path.dirname(os.path.realpath(__file__))
  >>> featurizer = dc.feat.ConvMolFeaturizer(per_atom_fragmentation=True)
  >>> loader = dc.data.SDFFragmentLoader([], featurizer=featurizer, sanitize=True)
  >>> dataset = loader.create_dataset(os.path.join(current_dir, "tests", "membrane_permeability.sdf")) # doctest:+ELLIPSIS
  >>> len(dataset) # equals sum of fragments resulting from all molecules
  98
  """
  pass


class FASTALoader(DataLoader):
  """Handles loading of FASTA files.

+0 −12
Original line number Diff line number Diff line
@@ -15,15 +15,3 @@ def test_singleton_csv_fragment_load_with_per_atom_fragmentation():
  X = loader.create_dataset(fin.name)
  assert len(X) == 6
  os.remove(fin.name)


def test_sdf_fragment_load_with_per_atom_fragmentation():
  """Check that a special form of dataset is created from SDF: a dataset
  of molecule fragments for subsequent model interpretation."""
  sdf_path = os.path.join(
      os.path.dirname(os.path.realpath(__file__)), "membrane_permeability.sdf")
  frag_featurizer = dc.feat.ConvMolFeaturizer(per_atom_fragmentation=True)
  frag_loader = dc.data.SDFFragmentLoader(
      ["LogP(RRCK)"], featurizer=frag_featurizer, sanitize=True)
  # One fragment per atom of every source molecule: 98 in total.
  assert len(frag_loader.create_dataset(sdf_path)) == 98
+15 −11
Original line number Diff line number Diff line
@@ -7,7 +7,7 @@ from deepchem.feat.complex_featurizers import ComplexNeighborListFragmentAtomicC
from deepchem.feat.mol_graphs import ConvMol, WeaveMol
from deepchem.data import DiskDataset
import logging
from typing import Optional, List
from typing import Optional, List, Union, Iterable
from deepchem.utils.typing import RDKitMol, RDKitAtom


@@ -653,10 +653,10 @@ class ConvMolFeaturizer(MolecularFeaturizer):
  name = ['conv_mol']

  def __init__(self,
               master_atom=False,
               use_chirality=False,
               atom_properties=[],
               per_atom_fragmentation=False):
               master_atom: bool = False,
               use_chirality: bool = False,
               atom_properties: Iterable[str] = [],
               per_atom_fragmentation: bool = False):
    """
    Parameters
    ----------
@@ -695,7 +695,10 @@ class ConvMolFeaturizer(MolecularFeaturizer):
    self.atom_properties = list(atom_properties)
    self.per_atom_fragmentation = per_atom_fragmentation

  def featurize(self, molecules, log_every_n=1000) -> np.ndarray:
  def featurize(
      self,
      molecules: Union[RDKitMol, str, Iterable[RDKitMol], Iterable[str]],
      log_every_n: int = 1000) -> np.ndarray:
    """
    Override parent: aim is to add handling atom-depleted molecules featurization
    
@@ -715,13 +718,14 @@ class ConvMolFeaturizer(MolecularFeaturizer):
    features = super(ConvMolFeaturizer, self).featurize(
        molecules, log_every_n=1000)
    if self.per_atom_fragmentation:
      # create temporary valid ids serving to filter out failed featurizations from every sublist
      # of features (i.e. every molecule's frags list), and also totally failed sublists.
      # This makes output digestible by Loaders
      valid_frag_inds = [[
          True if np.array(elt).size > 0 else False for elt in f
      ] for f in features]
      features = np.array(
          [[elt for (is_valid, elt) in zip(l, m) if is_valid]
           for (l, m) in zip(valid_frag_inds, features) if any(l)],
          dtype=object)
      features = [[elt for (is_valid, elt) in zip(l, m) if is_valid]
                  for (l, m) in zip(valid_frag_inds, features) if any(l)]
    return features

  def _get_atom_properties(self, atom):
@@ -760,7 +764,7 @@ class ConvMolFeaturizer(MolecularFeaturizer):

      Parameters
      ----------
      n: array of nodes (number_of_nodes X number_of_features)
      n: np.array of nodes (number_of_nodes X number_of_features)
      a: list of nested lists of adjacent node pairs

      """
+1 −0
Original line number Diff line number Diff line
@@ -21,4 +21,5 @@ from deepchem.trans.transformers import FeaturizationTransformer
from deepchem.trans.transformers import ImageTransformer
from deepchem.trans.transformers import DataTransforms
from deepchem.trans.transformers import Transformer
from deepchem.trans.transformers import FlatteningTransformer
from deepchem.trans.duplicate import DuplicateBalancingTransformer
Loading