Commit e2c3fce1 authored by yurievnamaria's avatar yurievnamaria
Browse files

Merge-commit from atomic_contributions (from f8ecb1f1642002652606a5e8ead0eb3526c9152a)

parent 311692fd
Loading
Loading
Loading
Loading
+58 −12
Original line number Diff line number Diff line
@@ -15,7 +15,7 @@ import numpy as np
from deepchem.utils.typing import OneOrMany
from deepchem.utils.data_utils import load_image_files, load_csv_files, load_json_files, load_sdf_files
from deepchem.utils.genomics_utils import encode_bio_sequence
from deepchem.feat import UserDefinedFeaturizer, Featurizer
from deepchem.feat import UserDefinedFeaturizer, Featurizer, ConvMolFeaturizer
from deepchem.data import Dataset, DiskDataset, NumpyDataset, ImageDataset

logger = logging.getLogger(__name__)
@@ -205,6 +205,18 @@ class DataLoader(object):
        time1 = time.time()
        X, valid_inds = self._featurize_shard(shard)
        ids = shard[self.id_field].values
        # Special case for a dataset of molecular fragments:
        # each fragment should receive the id of its parent molecule.
        # Also, X should be flattened, because it is a list of lists (one list of fragments per molecule).
        if isinstance(
            self.featurizer,
            ConvMolFeaturizer) and self.featurizer.per_atom_fragmentation:
          ids = ids[[
              any(i) for i in valid_inds
          ]]  # keep an id if at least one frag was generated from the mol
          ids = np.repeat(ids, [len(i) for i in X], axis=0)
          X = np.array([j for i in X for j in i])  # flatten
        else:
          ids = ids[valid_inds]
        if len(self.tasks) > 0:
          # Featurize task results iff they exist.
@@ -353,6 +365,11 @@ class CSVLoader(DataLoader):
      self.user_specified_features = featurizer.feature_fields
    self.featurizer = featurizer
    self.log_every_n = log_every_n
    if isinstance(self.featurizer, ConvMolFeaturizer):
      if self.featurizer.per_atom_fragmentation and len(self.tasks) > 0:
        self.tasks = []  # no sense in y and w for fragments
        warnings.warn(
            "Tasks and weights will be ignored, fragments can't have them")

  def _get_shards(self, input_files: List[str],
                  shard_size: Optional[int]) -> Iterator[pd.DataFrame]:
@@ -393,11 +410,23 @@ class CSVLoader(DataLoader):
      raise ValueError(
          "featurizer must be specified in constructor to featurizer data/")
    features = [elt for elt in self.featurizer(shard[self.feature_field])]
    if isinstance(self.featurizer,
                  ConvMolFeaturizer) and self.featurizer.per_atom_fragmentation:
      # Special case for a dataset of fragments:
      # failed elements are removed from the features (and flagged in valid_inds),
      # but the nested per-molecule structure is retained.
      valid_inds = [[True if np.array(elt).size > 0 else False for elt in f]
                    for f in features]
      features = [[elt for (is_valid, elt) in zip(l, m) if is_valid]
                  for (l, m) in zip(valid_inds, features) if any(l)]
    else:
      valid_inds = np.array(
          [1 if np.array(elt).size > 0 else 0 for elt in features], dtype=bool)
      features = [
          elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
      ]
    if isinstance(self.featurizer, ConvMolFeaturizer):
      if self.featurizer.per_atom_fragmentation:
        return features, valid_inds  # we dont convert to array, the structure is nested
    return np.array(features), valid_inds


@@ -737,6 +766,11 @@ class SDFLoader(DataLoader):
    # The field in which load_sdf_files return value stores smiles
    self.id_field = "smiles"
    self.log_every_n = log_every_n
    if isinstance(self.featurizer, ConvMolFeaturizer):
      if self.featurizer.per_atom_fragmentation and len(self.tasks) > 0:
        self.tasks = []  # no sense in y and w for fragments
        warnings.warn(
            "Tasks and weights will be ignored, fragments can't have them")

  def _get_shards(self, input_files: List[str],
                  shard_size: Optional[int]) -> Iterator[pd.DataFrame]:
@@ -781,11 +815,23 @@ class SDFLoader(DataLoader):
      sample in the source.
    """
    features = [elt for elt in self.featurizer(shard[self.mol_field])]
    if isinstance(self.featurizer,
                  ConvMolFeaturizer) and self.featurizer.per_atom_fragmentation:
      # Special case for a dataset of fragments:
      # failed elements are removed from the features (and flagged in valid_inds),
      # but the nested per-molecule structure is retained.
      valid_inds = [[True if np.array(elt).size > 0 else False for elt in f]
                    for f in features]
      features = [[elt for (is_valid, elt) in zip(l, m) if is_valid]
                  for (l, m) in zip(valid_inds, features) if any(l)]
    else:
      valid_inds = np.array(
          [1 if np.array(elt).size > 0 else 0 for elt in features], dtype=bool)
      features = [
          elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
      ]
    if isinstance(self.featurizer,
                  ConvMolFeaturizer) and self.featurizer.per_atom_fragmentation:
      return features, valid_inds  # we dont convert to array, the structure is nested with variable length
    return np.array(features), valid_inds


+16 −0
Original line number Diff line number Diff line
@@ -15,3 +15,19 @@ def test_load_singleton_csv():
  X = loader.create_dataset(fin.name)
  assert len(X) == 1
  os.remove(fin.name)


def test_singleton_csv_load_with_per_atom_fragmentation():
  """Test that a dataset of molecular fragments can be created from a CSV.

  With ``per_atom_fragmentation=True`` the loader is expected to produce one
  dataset entry per fragment rather than one per molecule. The single input
  molecule (``c1ccccc1``) is expected to yield 6 entries.
  """
  # delete=False keeps the file on disk after the handle is closed so the
  # loader can reopen it by name; the finally clause below removes it.
  with tempfile.NamedTemporaryFile(mode='w', delete=False) as fin:
    fin.write("smiles,endpoint\nc1ccccc1,1")
  try:
    featurizer = dc.feat.ConvMolFeaturizer(per_atom_fragmentation=True)
    tasks = ["endpoint"]
    loader = dc.data.CSVLoader(
        tasks=tasks, feature_field="smiles", featurizer=featurizer)

    dataset = loader.create_dataset(fin.name)
    assert len(dataset) == 6
  finally:
    # Clean up even when loading or the assertion fails, so that failed runs
    # do not leave stray temporary files behind.
    os.remove(fin.name)
+12 −0
Original line number Diff line number Diff line
@@ -57,3 +57,15 @@ def test_sdf_load_with_csv():
  assert len(dataset) == 10
  assert dataset.get_number_shards() == 10
  assert dataset.get_task_names() == ["atomization_energy"]


def test_sdf_load_with_per_atom_fragmentation():
  """Test that a dataset of molecular fragments can be created from an SDF.

  The featurizer is built with ``per_atom_fragmentation=True``, and the
  dataset loaded from ``membrane_permeability.sdf`` is expected to hold 98
  entries.
  """
  sdf_path = os.path.join(
      os.path.dirname(os.path.realpath(__file__)), "membrane_permeability.sdf")
  fragment_featurizer = dc.feat.ConvMolFeaturizer(per_atom_fragmentation=True)
  sdf_loader = dc.data.SDFLoader(
      ["LogP(RRCK)"], featurizer=fragment_featurizer, sanitize=True)
  fragments = sdf_loader.create_dataset(sdf_path)
  assert len(fragments) == 98
+57 −24
Original line number Diff line number Diff line
@@ -643,8 +643,11 @@ class ConvMolFeaturizer(MolecularFeaturizer):
  """
  name = ['conv_mol']

  def __init__(self, master_atom=False, use_chirality=False,
               atom_properties=[]):
  def __init__(self,
               master_atom=False,
               use_chirality=False,
               atom_properties=[],
               per_atom_fragmentation=False):
    """
    Parameters
    ----------
@@ -668,6 +671,10 @@ class ConvMolFeaturizer(MolecularFeaturizer):
      provided in atom_properties.  So "atom 00000000 sasa" would be the
      name of the molecule level property in mol where the solvent
      accessible surface area of atom 0 would be stored.
    per_atom_fragmentation: Boolean
      If True, each molecule is featurized several times, once for every
      "atom-deprived" copy of it: atoms are removed one at a time, and each
      resulting atom-deprived molecule is featurized.
      Thus, applying the featurize method will produce a list of ConvMol objects for each molecule.

    Since ConvMol is an object and not a numpy array, need to set dtype to
    object.
@@ -676,6 +683,7 @@ class ConvMolFeaturizer(MolecularFeaturizer):
    self.master_atom = master_atom
    self.use_chirality = use_chirality
    self.atom_properties = list(atom_properties)
    self.per_atom_fragmentation = per_atom_fragmentation

  def _get_atom_properties(self, atom):
    """
@@ -700,7 +708,31 @@ class ConvMolFeaturizer(MolecularFeaturizer):
    return np.array(values)

  def _featurize(self, mol):
    """Encodes mol as a ConvMol object."""
    """Encodes mol as a ConvMol object.
    If per_atom_fragmentation is True,
    then for each molecule a list of ConvMolObjects
    will be created"""

    def per_atom(n, a):
      """Yield every single-atom-deleted fragment of a molecular graph.

      Parameters
      ----------
      n: array with one row per node; ``n.shape[0]`` fragments are produced.
      a: adjacency list, where ``a[j]`` holds the neighbour indices of node j.

      Yields
      ------
      A ``(nodes, adjacency)`` pair per removed node: ``nodes`` is ``n``
      without the removed row, and ``adjacency`` is ``a`` without the removed
      node's entry, with references to it dropped and higher indices shifted
      down by one so they match the rows of ``nodes``.
      """
      for removed in range(n.shape[0]):
        kept_nodes = np.delete(n, removed, axis=0)
        kept_adjacency = [
            # Neighbours above the removed index move down one slot.
            [nbr - 1 if nbr > removed else nbr
             for nbr in neighbours
             if nbr != removed]
            for node_idx, neighbours in enumerate(a)
            if node_idx != removed
        ]
        yield kept_nodes, kept_adjacency

    # Get the node features
    idx_nodes = [(a.GetIdx(),
                  np.concatenate((atom_features(
@@ -718,10 +750,8 @@ class ConvMolFeaturizer(MolecularFeaturizer):
      nodes = np.concatenate([nodes, master_atom_features], axis=0)

    # Get bond lists with reverse edges included
    edge_list = [
        (b.GetBeginAtomIdx(), b.GetEndAtomIdx()) for b in mol.GetBonds()
    ]

    edge_list = [(b.GetBeginAtomIdx(), b.GetEndAtomIdx())
                 for b in mol.GetBonds()]
    # Get canonical adjacency list
    canon_adj_list = [[] for mol_id in range(len(nodes))]
    for edge in edge_list:
@@ -733,7 +763,10 @@ class ConvMolFeaturizer(MolecularFeaturizer):
      for index in range(len(nodes) - 1):
        canon_adj_list[index].append(fake_atom_index)

    if not self.per_atom_fragmentation:
      return ConvMol(nodes, canon_adj_list)
    else:
      return [ConvMol(n, a) for n, a in per_atom(nodes, canon_adj_list)]

  def feature_length(self):
    """Return 75 plus the number of entries in ``self.atom_properties``.

    NOTE(review): 75 is presumably the size of the base per-atom feature
    vector -- confirm against ``atom_features``.
    """
    n_extra_properties = len(self.atom_properties)
    return n_extra_properties + 75
+11 −0
Original line number Diff line number Diff line
@@ -88,6 +88,17 @@ class TestConvMolFeaturizer(unittest.TestCase):
    assert np.array_equal(deg_adj_lists[5], np.zeros([0, 5], dtype=np.int32))
    assert np.array_equal(deg_adj_lists[6], np.zeros([0, 6], dtype=np.int32))

  def test_per_atom_fragmentation(self):
    """Check that per-atom fragmentation yields one fragment per atom.

    With per_atom_fragmentation=True, each molecule's featurization should
    contain as many entries as the molecule has heavy atoms.
    """
    import rdkit.Chem
    smiles_strings = ['CC(CO)Cc1ccccc1', 'CC']
    molecules = [rdkit.Chem.MolFromSmiles(smi) for smi in smiles_strings]
    fragment_featurizer = ConvMolFeaturizer(per_atom_fragmentation=True)
    per_mol_fragments = fragment_featurizer.featurize(molecules)
    for fragments, molecule in zip(per_mol_fragments, molecules):
      assert len(fragments) == molecule.GetNumHeavyAtoms()


class TestAtomicConvFeaturizer(unittest.TestCase):

Loading