Commit 7ab6adc7 authored by Atreya Majumdar's avatar Atreya Majumdar
Browse files

Fixed merge conflicts

parents 3b54f27d dc88b8bd
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -40,6 +40,7 @@ DeepChem currently supports Python 3.6 through 3.7 and requires these packages o
- [TensorFlow](https://www.tensorflow.org/)
  - `deepchem>=2.4.0` depends on TensorFlow v2
  - `deepchem<2.4.0` depends on TensorFlow v1
- [Tensorflow Addons](https://www.tensorflow.org/addons) for Tensorflow v2 if you want to use advanced optimizers such as AdamW and Sparse Adam. (Optional)

### Soft Requirements

+1 −0
Original line number Diff line number Diff line
@@ -25,6 +25,7 @@ from deepchem.feat.molecule_featurizers import MACCSKeysFingerprint
from deepchem.feat.molecule_featurizers import MordredDescriptors
from deepchem.feat.molecule_featurizers import Mol2VecFingerprint
from deepchem.feat.molecule_featurizers import MolGraphConvFeaturizer
from deepchem.feat.molecule_featurizers import PagtnMolGraphFeaturizer
from deepchem.feat.molecule_featurizers import MolGanFeaturizer
from deepchem.feat.molecule_featurizers import OneHotFeaturizer
from deepchem.feat.molecule_featurizers import PubChemFingerprint
+1 −0
Original line number Diff line number Diff line
@@ -15,4 +15,5 @@ from deepchem.feat.molecule_featurizers.rdkit_descriptors import RDKitDescriptor
from deepchem.feat.molecule_featurizers.smiles_to_image import SmilesToImage
from deepchem.feat.molecule_featurizers.smiles_to_seq import SmilesToSeq, create_char_to_idx
from deepchem.feat.molecule_featurizers.mol_graph_conv_featurizer import MolGraphConvFeaturizer
from deepchem.feat.molecule_featurizers.mol_graph_conv_featurizer import PagtnMolGraphFeaturizer
from deepchem.feat.molecule_featurizers.molgan_featurizer import MolGanFeaturizer
+222 −0
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@ import numpy as np
from deepchem.utils.typing import RDKitAtom, RDKitBond, RDKitMol
from deepchem.feat.graph_data import GraphData
from deepchem.feat.base_classes import MolecularFeaturizer
from deepchem.utils.molecule_feature_utils import one_hot_encode
from deepchem.utils.molecule_feature_utils import get_atom_type_one_hot
from deepchem.utils.molecule_feature_utils import construct_hydrogen_bonding_info
from deepchem.utils.molecule_feature_utils import get_atom_hydrogen_bonding_one_hot
@@ -18,6 +19,11 @@ from deepchem.utils.molecule_feature_utils import get_bond_type_one_hot
from deepchem.utils.molecule_feature_utils import get_bond_is_in_same_ring_one_hot
from deepchem.utils.molecule_feature_utils import get_bond_is_conjugated_one_hot
from deepchem.utils.molecule_feature_utils import get_bond_stereo_one_hot
from deepchem.utils.molecule_feature_utils import get_atom_formal_charge_one_hot
from deepchem.utils.molecule_feature_utils import get_atom_implicit_valence_one_hot
from deepchem.utils.molecule_feature_utils import get_atom_explicit_valence_one_hot
from deepchem.utils.rdkit_utils import compute_all_pairs_shortest_path
from deepchem.utils.rdkit_utils import compute_pairwise_ring_info


def _construct_atom_feature(
@@ -217,3 +223,219 @@ class MolGraphConvFeaturizer(MolecularFeaturizer):
        node_features=atom_features,
        edge_index=np.asarray([src, dest], dtype=int),
        edge_features=bond_features)


class PagtnMolGraphFeaturizer(MolecularFeaturizer):
  """This class is a featuriser of PAGTN graph networks for molecules.

  The featurization is based on `PAGTN model <https://arxiv.org/abs/1905.12712>`_. It is
  slightly more computationally intensive than default Graph Convolution Featuriser, but it
  builds a Molecular Graph connecting all tom pairs accounting for interactions of atom with
  every other atom in the Molecule. According to the paper, interactions between two pairs
  of an atom are dependent on the relative distance between them and calculating the shortest
  path between them.

  The default node representation is constructed by concatenating the following values,
  and the feature length is 94.

  - Atom type: One hot encoding of the atom type. It consists of the most possible elements in a chemical compound.
  - Formal charge: One hot encoding of formal charge of the atom.
  - Degree: One hot encoding of the atom degree
  - Explicit Valence: One hot encoding of explicit valence of an atom. The supported possibilities
    include ``0 - 6``.
  - Implicit Valence: One hot encoding of implicit valence of an atom. The supported possibilities
    include ``0 - 5``.
  - Aromaticity: Boolean representing if an atom is aromatic.

  The default edge representation are constructed by concatenating the following values,
  and the feature length is 42. It builds a complete graph where each node is connected to
  every other node. The edge representations are calculated the shortest path between two nodes
  (choose any one if multiple exist). Each bond encountered in the shortest path is used to
  calculate edge features.

  - Bond type: A one-hot vector of the bond type, "single", "double", "triple", or "aromatic".
  - Conjugated: A one-hot vector of whether this bond is conjugated or not.
  - Same ring: A one-hot vector of whether the atoms in the pair are in the same ring.
  - Ring Size and Aromaticity: One hot encoding of atoms in pair based on ring size and aromaticity.
  - Distance: One hot encoding of the distance between pair of atoms.

  Examples
  --------
  >>> from deepchem.feat import PagtnMolGraphFeaturizer
  >>> smiles = ["C1CCC1", "C1=CC=CN=C1"]
  >>> featurizer = PagtnMolGraphFeaturizer(max_length=5)
  >>> out = featurizer.featurize(smiles)
  >>> type(out[0])
  <class 'deepchem.feat.graph_data.GraphData'>
  >>> out[0].num_node_features
  94
  >>> out[0].num_edge_features
  42

  References
  ----------
  .. [1] Chen, Barzilay, Jaakkola "Path-Augmented Graph Transformer Network"
     10.26434/chemrxiv.8214422.

  Note
  ----
  This class requires RDKit to be installed.

  """

  def __init__(self, max_length=5):
    """
    Parameters
    ----------
    max_length : int
      Maximum distance up to which shortest paths must be considered.
      Paths shorter than max_length will be padded and longer will be
      truncated, default to ``5``.
    """

    self.SYMBOLS = [
        'C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe',
        'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd',
        'Co', 'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In',
        'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'W', 'Ru', 'Nb', 'Re', 'Te', 'Rh',
        'Tc', 'Ba', 'Bi', 'Hf', 'Mo', 'U', 'Sm', 'Os', 'Ir', 'Ce', 'Gd', 'Ga',
        'Cs', '*', 'UNK'
    ]

    self.RING_TYPES = [(5, False), (5, True), (6, False), (6, True)]
    self.ordered_pair = lambda a, b: (a, b) if a < b else (b, a)
    self.max_length = max_length

  def _pagtn_atom_featurizer(self, atom: RDKitAtom) -> np.ndarray:
    """Calculate Atom features from RDKit atom object.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
      RDKit mol object.

    Returns
    -------
    atom_feat: np.ndarray
      numpy vector of atom features.
    """
    atom_type = get_atom_type_one_hot(atom, self.SYMBOLS, False)
    formal_charge = get_atom_formal_charge_one_hot(
        atom, include_unknown_set=False)
    degree = get_atom_total_degree_one_hot(atom, list(range(11)), False)
    exp_valence = get_atom_explicit_valence_one_hot(atom, list(range(7)), False)
    imp_valence = get_atom_implicit_valence_one_hot(atom, list(range(6)), False)
    armoticity = get_atom_is_in_aromatic_one_hot(atom)
    atom_feat = np.concatenate([
        atom_type, formal_charge, degree, exp_valence, imp_valence, armoticity
    ])
    return atom_feat

  def _edge_features(self, mol: RDKitMol, path_atoms: Tuple[int, ...],
                     ring_info) -> np.ndarray:
    """Computes the edge features for a given pair of nodes.

    Parameters
    ----------
    mol : : RDKitMol
        RDKit molecule instance.
    path_atoms: tuple
        Shortest path between the given pair of nodes.
    ring_info: list
        Different rings that contain the pair of atoms
    """
    features = []
    path_bonds = []
    path_length = len(path_atoms)
    for path_idx in range(path_length - 1):
      bond = mol.GetBondBetweenAtoms(path_atoms[path_idx],
                                     path_atoms[path_idx + 1])
      if bond is None:
        import warnings
        warnings.warn('Valid idx of bonds must be passed')
      path_bonds.append(bond)

    for path_idx in range(self.max_length):
      if path_idx < len(path_bonds):
        bond_type = get_bond_type_one_hot(path_bonds[path_idx])
        conjugacy = get_bond_is_conjugated_one_hot(path_bonds[path_idx])
        ring_attach = get_bond_is_in_same_ring_one_hot(path_bonds[path_idx])
        features.append(np.concatenate([bond_type, conjugacy, ring_attach]))
      else:
        features.append(np.zeros(6))

    if path_length + 1 > self.max_length:
      path_length = self.max_length + 1
    position_feature = np.zeros(self.max_length + 2)
    position_feature[path_length] = 1
    features.append(position_feature)
    if ring_info:
      rfeat = [
          one_hot_encode(r, allowable_set=self.RING_TYPES) for r in ring_info
      ]
      # The 1.0 float value represents True Boolean
      rfeat = [1.0] + np.any(rfeat, axis=0).tolist()
      features.append(rfeat)
    else:
      # This will return a boolean vector with all entries False
      features.append([0.0] +
                      one_hot_encode(ring_info, allowable_set=self.RING_TYPES))
    return np.concatenate(features, axis=0)

  def _pagtn_edge_featurizer(self,
                             mol: RDKitMol) -> Tuple[np.ndarray, np.ndarray]:
    """Calculate bond features from RDKit mol object.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
      RDKit mol object.

    Returns
    -------
    np.ndarray
      Source and Destination node indexes of each bond.
    np.ndarray
      numpy vector of bond features.
    """
    n_atoms = mol.GetNumAtoms()
    # To get the shortest paths between two nodes.
    paths_dict = compute_all_pairs_shortest_path(mol)
    # To get info if two nodes belong to the same ring.
    rings_dict = compute_pairwise_ring_info(mol)
    # Featurizer
    feats = []
    src = []
    dest = []
    for i in range(n_atoms):
      for j in range(n_atoms):
        src.append(i)
        dest.append(j)

        if (i, j) not in paths_dict:
          feats.append(np.zeros(7 * self.max_length + 7))
          continue
        ring_info = rings_dict.get(self.ordered_pair(i, j), [])
        feats.append(self._edge_features(mol, paths_dict[(i, j)], ring_info))

    return np.array([src, dest], dtype=np.int), np.array(feats, dtype=np.float)

  def _featurize(self, mol: RDKitMol) -> GraphData:
    """Calculate molecule graph features from RDKit mol object.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
      RDKit mol object.

    Returns
    -------
    graph: GraphData
      A molecule graph with some features.
    """
    node_features = np.asarray(
        [self._pagtn_atom_featurizer(atom) for atom in mol.GetAtoms()],
        dtype=np.float)
    edge_index, edge_features = self._pagtn_edge_featurizer(mol)
    graph = GraphData(node_features, edge_index, edge_features)
    return graph
+22 −0
Original line number Diff line number Diff line
import unittest

from deepchem.feat import MolGraphConvFeaturizer
from deepchem.feat import PagtnMolGraphFeaturizer


class TestMolGraphConvFeaturizer(unittest.TestCase):
@@ -70,3 +71,24 @@ class TestMolGraphConvFeaturizer(unittest.TestCase):
    assert graph_feat[1].num_nodes == 22
    assert graph_feat[1].num_node_features == 31
    assert graph_feat[1].num_edges == 44


class TestPagtnMolGraphConvFeaturizer(unittest.TestCase):

  def test_default_featurizer(self):
    smiles = ["C1=CC=CN=C1", "O=C(NCc1cc(OC)c(O)cc1)CCCC/C=C/C(C)C"]
    featurizer = PagtnMolGraphFeaturizer(max_length=5)
    graph_feat = featurizer.featurize(smiles)
    assert len(graph_feat) == 2

    # assert "C1=CC=CN=C1"
    assert graph_feat[0].num_nodes == 6
    assert graph_feat[0].num_node_features == 94
    assert graph_feat[0].num_edges == 36
    assert graph_feat[0].num_edge_features == 42

    # assert "O=C(NCc1cc(OC)c(O)cc1)CCCC/C=C/C(C)C"
    assert graph_feat[1].num_nodes == 22
    assert graph_feat[1].num_node_features == 94
    assert graph_feat[1].num_edges == 484
    assert graph_feat[0].num_edge_features == 42
Loading