Commit 443540b7 authored by DavidRFB's avatar DavidRFB
Browse files

Merge branch 'master' of https://github.com/deepchem/deepchem into tutorial_ProteinDL

parents 709f0faf a8ead748
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -40,6 +40,7 @@ DeepChem currently supports Python 3.6 through 3.7 and requires these packages o
- [TensorFlow](https://www.tensorflow.org/)
  - `deepchem>=2.4.0` depends on TensorFlow v2
  - `deepchem<2.4.0` depends on TensorFlow v1
- [TensorFlow Addons](https://www.tensorflow.org/addons) for TensorFlow v2, if you want to use advanced optimizers such as AdamW and Sparse Adam. (Optional)

### Soft Requirements

+1 −0
Original line number Diff line number Diff line
@@ -25,6 +25,7 @@ from deepchem.feat.molecule_featurizers import MACCSKeysFingerprint
from deepchem.feat.molecule_featurizers import MordredDescriptors
from deepchem.feat.molecule_featurizers import Mol2VecFingerprint
from deepchem.feat.molecule_featurizers import MolGraphConvFeaturizer
from deepchem.feat.molecule_featurizers import PagtnMolGraphFeaturizer
from deepchem.feat.molecule_featurizers import MolGanFeaturizer
from deepchem.feat.molecule_featurizers import OneHotFeaturizer
from deepchem.feat.molecule_featurizers import PubChemFingerprint
+1 −0
Original line number Diff line number Diff line
@@ -15,4 +15,5 @@ from deepchem.feat.molecule_featurizers.rdkit_descriptors import RDKitDescriptor
from deepchem.feat.molecule_featurizers.smiles_to_image import SmilesToImage
from deepchem.feat.molecule_featurizers.smiles_to_seq import SmilesToSeq, create_char_to_idx
from deepchem.feat.molecule_featurizers.mol_graph_conv_featurizer import MolGraphConvFeaturizer
from deepchem.feat.molecule_featurizers.mol_graph_conv_featurizer import PagtnMolGraphFeaturizer
from deepchem.feat.molecule_featurizers.molgan_featurizer import MolGanFeaturizer
+39 −2
Original line number Diff line number Diff line
@@ -58,13 +58,12 @@ class Mol2VecFingerprint(MolecularFeaturizer):
    """
    try:
      from gensim.models import word2vec
      from mol2vec.features import mol2alt_sentence, sentences2vec
      from mol2vec.features import mol2alt_sentence
    except ModuleNotFoundError:
      raise ImportError("This class requires mol2vec to be installed.")

    self.radius = radius
    self.unseen = unseen
    self.sentences2vec = sentences2vec
    self.mol2alt_sentence = mol2alt_sentence
    if pretrain_model_path is None:
      data_dir = get_data_dir()
@@ -78,6 +77,44 @@ class Mol2VecFingerprint(MolecularFeaturizer):
    # load pretrained models
    self.model = word2vec.Word2Vec.load(pretrain_model_path)

  def sentences2vec(self, sentences: list, model, unseen=None) -> np.ndarray:
    """Generate one vector per sentence by summing its word vectors.

    Each sentence is a list of words; its vector is the element-wise sum of
    the word2vec vectors of those words.

    Parameters
    ----------
    sentences : list, array
        List with sentences (each sentence is itself a list of words).
    model : word2vec.Word2Vec
        Gensim word2vec model.
    unseen : None, str
        Keyword for unseen words. If None (or otherwise falsy), words missing
        from the model vocabulary are skipped; otherwise they are replaced by
        the vector of this keyword.
        https://stats.stackexchange.com/questions/163005/how-to-set-the-dictionary-for-text-analysis-using-neural-networks/163032#163032

    Returns
    -------
    np.ndarray
        Array with one row per sentence. A sentence that contributes no word
        vectors (empty, or made only of skipped words) yields an all-zero row.
    """
    word_vectors = model.wv
    # Dict membership is O(1); every `y` already comes from `sentence`, so
    # testing against the vocabulary alone is equivalent to the former
    # `y in set(sentence) & keys`, without rebuilding a set for every word.
    vocabulary = word_vectors.key_to_index
    unseen_vec = word_vectors.get_vector(unseen) if unseen else None

    vec = []
    for sentence in sentences:
      if unseen:
        word_vecs = [
            word_vectors.get_vector(y) if y in vocabulary else unseen_vec
            for y in sentence
        ]
      else:
        word_vecs = [
            word_vectors.get_vector(y) for y in sentence if y in vocabulary
        ]

      if word_vecs:
        vec.append(sum(word_vecs))
      else:
        # `sum([])` would be the scalar 0, which makes the result ragged
        # (and is rejected by recent NumPy). Use a zero vector of the
        # embedding width so every row has the same shape.
        vec.append(
            np.zeros(
                word_vectors.vector_size, dtype=word_vectors.vectors.dtype))
    return np.array(vec)

  def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """
    Calculate Mordred descriptors.
+222 −0
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@ import numpy as np
from deepchem.utils.typing import RDKitAtom, RDKitBond, RDKitMol
from deepchem.feat.graph_data import GraphData
from deepchem.feat.base_classes import MolecularFeaturizer
from deepchem.utils.molecule_feature_utils import one_hot_encode
from deepchem.utils.molecule_feature_utils import get_atom_type_one_hot
from deepchem.utils.molecule_feature_utils import construct_hydrogen_bonding_info
from deepchem.utils.molecule_feature_utils import get_atom_hydrogen_bonding_one_hot
@@ -18,6 +19,11 @@ from deepchem.utils.molecule_feature_utils import get_bond_type_one_hot
from deepchem.utils.molecule_feature_utils import get_bond_is_in_same_ring_one_hot
from deepchem.utils.molecule_feature_utils import get_bond_is_conjugated_one_hot
from deepchem.utils.molecule_feature_utils import get_bond_stereo_one_hot
from deepchem.utils.molecule_feature_utils import get_atom_formal_charge_one_hot
from deepchem.utils.molecule_feature_utils import get_atom_implicit_valence_one_hot
from deepchem.utils.molecule_feature_utils import get_atom_explicit_valence_one_hot
from deepchem.utils.rdkit_utils import compute_all_pairs_shortest_path
from deepchem.utils.rdkit_utils import compute_pairwise_ring_info


def _construct_atom_feature(
@@ -217,3 +223,219 @@ class MolGraphConvFeaturizer(MolecularFeaturizer):
        node_features=atom_features,
        edge_index=np.asarray([src, dest], dtype=int),
        edge_features=bond_features)


class PagtnMolGraphFeaturizer(MolecularFeaturizer):
  """This class is a featuriser of PAGTN graph networks for molecules.

  The featurization is based on `PAGTN model <https://arxiv.org/abs/1905.12712>`_. It is
  slightly more computationally intensive than default Graph Convolution Featuriser, but it
  builds a Molecular Graph connecting all atom pairs accounting for interactions of an atom with
  every other atom in the Molecule. According to the paper, interactions between two pairs
  of atom are dependent on the relative distance between them and hence, the function needs
  to calculate the shortest path between them.

  The default node representation is constructed by concatenating the following values,
  and the feature length is 94.

  - Atom type: One hot encoding of the atom type. It consists of the most possible elements in a chemical compound.
  - Formal charge: One hot encoding of formal charge of the atom.
  - Degree: One hot encoding of the atom degree
  - Explicit Valence: One hot encoding of explicit valence of an atom. The supported possibilities
    include ``0 - 6``.
  - Implicit Valence: One hot encoding of implicit valence of an atom. The supported possibilities
    include ``0 - 5``.
  - Aromaticity: Boolean representing if an atom is aromatic.

  The default edge representation is constructed by concatenating the following values,
  and the feature length is 42. It builds a complete graph where each node is connected to
  every other node. The edge representations are calculated based on the shortest path between two nodes
  (choose any one if multiple exist). Each bond encountered in the shortest path is used to
  calculate edge features.

  - Bond type: A one-hot vector of the bond type, "single", "double", "triple", or "aromatic".
  - Conjugated: A one-hot vector of whether this bond is conjugated or not.
  - Same ring: A one-hot vector of whether the atoms in the pair are in the same ring.
  - Ring Size and Aromaticity: One hot encoding of atoms in pair based on ring size and aromaticity.
  - Distance: One hot encoding of the distance between pair of atoms.

  Examples
  --------
  >>> from deepchem.feat import PagtnMolGraphFeaturizer
  >>> smiles = ["C1CCC1", "C1=CC=CN=C1"]
  >>> featurizer = PagtnMolGraphFeaturizer(max_length=5)
  >>> out = featurizer.featurize(smiles)
  >>> type(out[0])
  <class 'deepchem.feat.graph_data.GraphData'>
  >>> out[0].num_node_features
  94
  >>> out[0].num_edge_features
  42

  References
  ----------
  .. [1] Chen, Barzilay, Jaakkola "Path-Augmented Graph Transformer Network"
     10.26434/chemrxiv.8214422.

  Note
  ----
  This class requires RDKit to be installed.

  """

  def __init__(self, max_length: int = 5):
    """
    Parameters
    ----------
    max_length : int
      Maximum distance up to which shortest paths must be considered.
      Paths shorter than max_length will be padded and longer will be
      truncated, default to ``5``.
    """

    # Allowable atom symbols for the atom-type one-hot encoding.
    self.SYMBOLS = [
        'C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe',
        'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd',
        'Co', 'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In',
        'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'W', 'Ru', 'Nb', 'Re', 'Te', 'Rh',
        'Tc', 'Ba', 'Bi', 'Hf', 'Mo', 'U', 'Sm', 'Os', 'Ir', 'Ce', 'Gd', 'Ga',
        'Cs', '*', 'UNK'
    ]

    # Allowable set used to one-hot encode the ring entries of an atom pair.
    # presumably (ring size, is aromatic) -- confirm against
    # compute_pairwise_ring_info.
    self.RING_TYPES = [(5, False), (5, True), (6, False), (6, True)]
    # Normalises an index pair to (smaller, larger); used as the lookup key
    # into the pairwise ring-info dictionary.
    self.ordered_pair = lambda a, b: (a, b) if a < b else (b, a)
    self.max_length = max_length

  def _pagtn_atom_featurizer(self, atom: RDKitAtom) -> np.ndarray:
    """Calculate Atom features from RDKit atom object.

    Parameters
    ----------
    atom: rdkit.Chem.rdchem.Atom
      RDKit atom object.

    Returns
    -------
    atom_feat: np.ndarray
      numpy vector of atom features.
    """
    atom_type = get_atom_type_one_hot(atom, self.SYMBOLS, False)
    formal_charge = get_atom_formal_charge_one_hot(
        atom, include_unknown_set=False)
    degree = get_atom_total_degree_one_hot(atom, list(range(11)), False)
    exp_valence = get_atom_explicit_valence_one_hot(atom, list(range(7)), False)
    imp_valence = get_atom_implicit_valence_one_hot(atom, list(range(6)), False)
    aromaticity = get_atom_is_in_aromatic_one_hot(atom)
    atom_feat = np.concatenate([
        atom_type, formal_charge, degree, exp_valence, imp_valence, aromaticity
    ])
    return atom_feat

  def _edge_features(self, mol: RDKitMol, path_atoms: Tuple[int, ...],
                     ring_info) -> np.ndarray:
    """Computes the edge features for a given pair of nodes.

    The result concatenates, in order: ``max_length`` per-bond slots of
    length 6 (zero-padded when the path is shorter), a position one-hot of
    length ``max_length + 2``, and a ring vector made of one flag followed
    by one entry per ring type.

    Parameters
    ----------
    mol : RDKitMol
        RDKit molecule instance.
    path_atoms: tuple
        Shortest path between the given pair of nodes.
    ring_info: list
        Different rings that contain the pair of atoms

    Returns
    -------
    np.ndarray
        Flat feature vector for the atom pair.
    """
    import warnings

    features = []
    path_bonds = []
    path_length = len(path_atoms)
    # Collect the bond between each pair of consecutive atoms on the path.
    for path_idx in range(path_length - 1):
      bond = mol.GetBondBetweenAtoms(path_atoms[path_idx],
                                     path_atoms[path_idx + 1])
      if bond is None:
        warnings.warn('Valid idx of bonds must be passed')
      path_bonds.append(bond)

    # Exactly max_length bond slots: bonds beyond max_length are dropped and
    # missing ones are zero-padded.
    for path_idx in range(self.max_length):
      if path_idx < len(path_bonds):
        bond_type = get_bond_type_one_hot(path_bonds[path_idx])
        conjugacy = get_bond_is_conjugated_one_hot(path_bonds[path_idx])
        ring_attach = get_bond_is_in_same_ring_one_hot(path_bonds[path_idx])
        features.append(np.concatenate([bond_type, conjugacy, ring_attach]))
      else:
        features.append(np.zeros(6))

    # One-hot position indexed by the number of atoms on the path; any path
    # with at least max_length atoms is mapped to the last slot.
    if path_length + 1 > self.max_length:
      path_length = self.max_length + 1
    position_feature = np.zeros(self.max_length + 2)
    position_feature[path_length] = 1
    features.append(position_feature)
    if ring_info:
      rfeat = [
          one_hot_encode(r, allowable_set=self.RING_TYPES) for r in ring_info
      ]
      # The 1.0 float value represents True Boolean
      rfeat = [1.0] + np.any(rfeat, axis=0).tolist()
      features.append(rfeat)
    else:
      # This will return a boolean vector with all entries False
      features.append([0.0] +
                      one_hot_encode(ring_info, allowable_set=self.RING_TYPES))
    return np.concatenate(features, axis=0)

  def _pagtn_edge_featurizer(self,
                             mol: RDKitMol) -> Tuple[np.ndarray, np.ndarray]:
    """Calculate bond features from RDKit mol object.

    Every ordered pair of atoms ``(i, j)``, including ``i == j``, gets an
    edge, so the resulting graph is complete.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
      RDKit mol object.

    Returns
    -------
    np.ndarray
      Source and Destination node indexes of each bond.
    np.ndarray
      numpy vector of bond features.
    """
    n_atoms = mol.GetNumAtoms()
    # To get the shortest paths between two nodes.
    paths_dict = compute_all_pairs_shortest_path(mol)
    # To get info if two nodes belong to the same ring.
    rings_dict = compute_pairwise_ring_info(mol)
    # Featurizer
    feats = []
    src = []
    dest = []
    for i in range(n_atoms):
      for j in range(n_atoms):
        src.append(i)
        dest.append(j)

        if (i, j) not in paths_dict:
          # No path recorded for this pair: all-zero vector with the same
          # length as the output of `_edge_features`.
          feats.append(np.zeros(7 * self.max_length + 7))
          continue
        ring_info = rings_dict.get(self.ordered_pair(i, j), [])
        feats.append(self._edge_features(mol, paths_dict[(i, j)], ring_info))

    # Builtin int/float are used because the `np.int` / `np.float` aliases
    # were removed from NumPy.
    return np.array([src, dest], dtype=int), np.array(feats, dtype=float)

  def _featurize(self, mol: RDKitMol) -> GraphData:
    """Calculate molecule graph features from RDKit mol object.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
      RDKit mol object.

    Returns
    -------
    graph: GraphData
      A molecule graph with some features.
    """
    # Builtin float is used because the `np.float` alias was removed from
    # NumPy.
    node_features = np.asarray(
        [self._pagtn_atom_featurizer(atom) for atom in mol.GetAtoms()],
        dtype=float)
    edge_index, edge_features = self._pagtn_edge_featurizer(mol)
    graph = GraphData(node_features, edge_index, edge_features)
    return graph
Loading