Commit 16eb424b authored by yurievnamaria's avatar yurievnamaria
Browse files

Added model interpretation functionality: ability to calculate contributions...

Added model interpretation functionality: the ability to calculate contributions of atoms based on GraphConvModel, and an example tutorial.
parent 3e5f03f4
Loading
Loading
Loading
Loading
+58 −12
Original line number Diff line number Diff line
@@ -15,7 +15,7 @@ import numpy as np
from deepchem.utils.typing import OneOrMany
from deepchem.utils.data_utils import load_image_files, load_csv_files, load_json_files, load_sdf_files
from deepchem.utils.genomics_utils import encode_bio_sequence
from deepchem.feat import UserDefinedFeaturizer, Featurizer
from deepchem.feat import UserDefinedFeaturizer, Featurizer, ConvMolFeaturizer
from deepchem.data import Dataset, DiskDataset, NumpyDataset, ImageDataset

logger = logging.getLogger(__name__)
@@ -205,6 +205,18 @@ class DataLoader(object):
        time1 = time.time()
        X, valid_inds = self._featurize_shard(shard)
        ids = shard[self.id_field].values
        # Special case when we deal with a dataset of molecular fragments:
        # each fragment should receive the id of its parent mol.
        # Also, X should be flattened, because it is a list of lists (one list of frags per mol).
        if isinstance(
            self.featurizer,
            ConvMolFeaturizer) and self.featurizer.per_atom_fragmentation:
          ids = ids[[
              any(i) for i in valid_inds
          ]]  # keep an id if at least one frag was generated from the mol
          ids = np.repeat(ids, [len(i) for i in X], axis=0)
          X = np.array([j for i in X for j in i])  # flatten
        else:
          ids = ids[valid_inds]
        if len(self.tasks) > 0:
          # Featurize task results iff they exist.
@@ -353,6 +365,11 @@ class CSVLoader(DataLoader):
      self.user_specified_features = featurizer.feature_fields
    self.featurizer = featurizer
    self.log_every_n = log_every_n
    if isinstance(self.featurizer, ConvMolFeaturizer):
      if self.featurizer.per_atom_fragmentation and len(self.tasks) > 0:
        self.tasks = []  # no sense in y and w for fragments
        warnings.warn(
            "Tasks and weights will be ignored, fragments can't have them")

  def _get_shards(self, input_files: List[str],
                  shard_size: Optional[int]) -> Iterator[pd.DataFrame]:
@@ -393,11 +410,23 @@ class CSVLoader(DataLoader):
      raise ValueError(
          "featurizer must be specified in constructor to featurizer data/")
    features = [elt for elt in self.featurizer(shard[self.feature_field])]
    if isinstance(self.featurizer,
                  ConvMolFeaturizer) and self.featurizer.per_atom_fragmentation:
      # special case when we deal with fragments dataset:
      # ids and features should be cleaned from failed elements, but retain nested structure
      valid_inds = [[True if np.array(elt).size > 0 else False for elt in f]
                    for f in features]
      features = [[elt for (is_valid, elt) in zip(l, m) if is_valid]
                  for (l, m) in zip(valid_inds, features) if any(l)]
    else:
      valid_inds = np.array(
          [1 if np.array(elt).size > 0 else 0 for elt in features], dtype=bool)
      features = [
          elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
      ]
    if isinstance(self.featurizer, ConvMolFeaturizer):
      if self.featurizer.per_atom_fragmentation:
        return features, valid_inds  # we dont convert to array, the structure is nested
    return np.array(features), valid_inds


@@ -727,6 +756,11 @@ class SDFLoader(DataLoader):
    # The field in which load_sdf_files return value stores smiles
    self.id_field = "smiles"
    self.log_every_n = log_every_n
    if isinstance(self.featurizer, ConvMolFeaturizer):
      if self.featurizer.per_atom_fragmentation and len(self.tasks) > 0:
        self.tasks = []  # no sense in y and w for fragments
        warnings.warn(
            "Tasks and weights will be ignored, fragments can't have them")

  def _get_shards(self, input_files: List[str],
                  shard_size: Optional[int]) -> Iterator[pd.DataFrame]:
@@ -771,11 +805,23 @@ class SDFLoader(DataLoader):
      sample in the source.
    """
    features = [elt for elt in self.featurizer(shard[self.mol_field])]
    if isinstance(self.featurizer,
                  ConvMolFeaturizer) and self.featurizer.per_atom_fragmentation:
      # special case when we deal with fragments dataset:
      # ids and features should be cleaned from failed elements, but retain nested structure
      valid_inds = [[True if np.array(elt).size > 0 else False for elt in f]
                    for f in features]
      features = [[elt for (is_valid, elt) in zip(l, m) if is_valid]
                  for (l, m) in zip(valid_inds, features) if any(l)]
    else:
      valid_inds = np.array(
          [1 if np.array(elt).size > 0 else 0 for elt in features], dtype=bool)
      features = [
          elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
      ]
    if isinstance(self.featurizer,
                  ConvMolFeaturizer) and self.featurizer.per_atom_fragmentation:
      return features, valid_inds  # we dont convert to array, the structure is nested with variable length
    return np.array(features), valid_inds


+41 −7
Original line number Diff line number Diff line
@@ -643,8 +643,11 @@ class ConvMolFeaturizer(MolecularFeaturizer):
  """
  name = ['conv_mol']

  def __init__(self, master_atom=False, use_chirality=False,
               atom_properties=[]):
  def __init__(self,
               master_atom=False,
               use_chirality=False,
               atom_properties=[],
               per_atom_fragmentation=False):
    """
    Parameters
    ----------
@@ -668,6 +671,10 @@ class ConvMolFeaturizer(MolecularFeaturizer):
      provided in atom_properties.  So "atom 00000000 sasa" would be the
      name of the molecule level property in mol where the solvent
      accessible surface area of atom 0 would be stored.
    per_atom_fragmentation: Boolean
      If True, each molecule is featurized multiple times in "atom-deprived" form: atoms are removed
      one at a time, and each resulting atom-deprived molecule is featurized separately.
      Thus, applying the featurize method will produce a list of ConvMol objects for each molecule.

    Since ConvMol is an object and not a numpy array, need to set dtype to
    object.
@@ -676,6 +683,7 @@ class ConvMolFeaturizer(MolecularFeaturizer):
    self.master_atom = master_atom
    self.use_chirality = use_chirality
    self.atom_properties = list(atom_properties)
    self.per_atom_fragmentation = per_atom_fragmentation

  def _get_atom_properties(self, atom):
    """
@@ -700,7 +708,31 @@ class ConvMolFeaturizer(MolecularFeaturizer):
    return np.array(values)

  def _featurize(self, mol):
    """Encodes mol as a ConvMol object."""
    """Encodes mol as a ConvMol object.
    If per_atom_fragmentation is True,
    then for each molecule a list of ConvMolObjects
    will be created"""

    def per_atom(n, a):
      """Generate all single-atom-removed fragments of a molecular graph.

      Parameters
      ----------
      n: np.ndarray
        Array of nodes, one entry per atom along axis 0.
      a: list of lists
        Adjacency list; a[k] lists the indices of the neighbors of atom k.

      Yields
      ------
      tuple
        (nodes, adjacency) for the graph with one atom deleted. One pair is
        yielded per atom, in order of the removed atom's index.
      """
      atom_count = n.shape[0]
      for removed in range(atom_count):
        # Drop the node row belonging to the removed atom.
        kept_nodes = np.delete(n, removed, axis=0)
        # Drop the removed atom's own neighbor list and every edge pointing
        # to it; indices above the removed atom shift down by one so that
        # they keep referring to the same rows of kept_nodes.
        kept_adj = [[
            nbr - 1 if nbr > removed else nbr
            for nbr in neighbors
            if nbr != removed
        ]
                    for atom_idx, neighbors in enumerate(a)
                    if atom_idx != removed]
        yield kept_nodes, kept_adj

    # Get the node features
    idx_nodes = [(a.GetIdx(),
                  np.concatenate((atom_features(
@@ -718,9 +750,8 @@ class ConvMolFeaturizer(MolecularFeaturizer):
      nodes = np.concatenate([nodes, master_atom_features], axis=0)

    # Get bond lists with reverse edges included
    edge_list = [
        (b.GetBeginAtomIdx(), b.GetEndAtomIdx()) for b in mol.GetBonds()
    ]
    edge_list = [(b.GetBeginAtomIdx(), b.GetEndAtomIdx())
                 for b in mol.GetBonds()]

    # Get canonical adjacency list
    canon_adj_list = [[] for mol_id in range(len(nodes))]
@@ -733,7 +764,10 @@ class ConvMolFeaturizer(MolecularFeaturizer):
      for index in range(len(nodes) - 1):
        canon_adj_list[index].append(fake_atom_index)

    if not self.per_atom_fragmentation:
      return ConvMol(nodes, canon_adj_list)
    else:
      return [ConvMol(n, a) for n, a in per_atom(nodes, canon_adj_list)]

  def feature_length(self):
    """Return 75 plus the number of entries in ``self.atom_properties``."""
    extra_property_count = len(self.atom_properties)
    return extra_property_count + 75
+7232 −0

File added.

Preview size limit exceeded, changes collapsed.

+15414 −0

File added.

Preview size limit exceeded, changes collapsed.

+61741 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading