Unverified commit eab6ee63, authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #2544 from atreyamaj/Atreya_MAT

[WIP] Adding the MAT Featurizer 
parents 885b2adb 374fab1a
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -33,6 +33,7 @@ from deepchem.feat.molecule_featurizers import RawFeaturizer
from deepchem.feat.molecule_featurizers import RDKitDescriptors
from deepchem.feat.molecule_featurizers import SmilesToImage
from deepchem.feat.molecule_featurizers import SmilesToSeq, create_char_to_idx
from deepchem.feat.molecule_featurizers import MATFeaturizer

# complex featurizers
from deepchem.feat.complex_featurizers import RdkitGridFeaturizer
+1 −0
Original line number Diff line number Diff line
@@ -17,3 +17,4 @@ from deepchem.feat.molecule_featurizers.smiles_to_seq import SmilesToSeq, create
from deepchem.feat.molecule_featurizers.mol_graph_conv_featurizer import MolGraphConvFeaturizer
from deepchem.feat.molecule_featurizers.mol_graph_conv_featurizer import PagtnMolGraphFeaturizer
from deepchem.feat.molecule_featurizers.molgan_featurizer import MolGanFeaturizer
from deepchem.feat.molecule_featurizers.mat_featurizer import MATFeaturizer
+103 −0
Original line number Diff line number Diff line
from deepchem.feat.base_classes import MolecularFeaturizer
from deepchem.utils.molecule_feature_utils import one_hot_encode
from deepchem.utils.typing import RDKitMol, RDKitAtom
import numpy as np


class MATFeaturizer(MolecularFeaturizer):
  """Featurizer for the Molecule Attention Transformer (MAT) [1]_.

  For each molecule this featurizer produces a single 2D numpy array in
  which the following per-atom blocks are concatenated column-wise:

  - Node features (see `atom_features`)
  - Adjacency matrix
  - Distance matrix

  Every row of the result therefore describes one atom of the molecule.

  References
  ----------
  .. [1] Lukasz Maziarka et al. "Molecule Attention Transformer"
     https://arxiv.org/abs/2002.08264

  Examples
  --------
  >>> import deepchem as dc
  >>> feat = dc.feat.MATFeaturizer()
  >>> out = feat.featurize("CCC")

  Note
  ----
  This class requires RDKit to be installed.
  """

  def __init__(
      self,
      one_hot_formal_charge: bool = True,
  ):
    """
    Parameters
    ----------
    one_hot_formal_charge: bool, default True
      If True, formal charges on atoms are one-hot encoded over the values
      -1, 0 and 1. If False, the raw formal charge is used as a single
      feature instead.
    """

    self.one_hot_formal_charge = one_hot_formal_charge

  def atom_features(self, atom: RDKitAtom) -> np.ndarray:
    """Compute the MAT node feature vector for a single atom.

    Deepchem already contains an atom_features function, however a separate
    one is defined here because MAT needs features such as the neighbor count
    and ring membership, and only a fraction of what the generic function
    computes.

    The features are emitted in this order:

    1. one-hot atomic number over [5, 6, 7, 8, 9, 15, 16, 17, 35, 53, 999]
    2. one-hot number of neighbors over [0, 1, 2, 3, 4, 5]
    3. one-hot total number of hydrogens over [0, 1, 2, 3, 4]
    4. formal charge (one-hot over [-1, 0, 1] if `one_hot_formal_charge`
       is True, otherwise the raw value)
    5. ring membership flag
    6. aromaticity flag

    Parameters
    ----------
    atom: RDKitAtom
      RDKit Atom object.

    Returns
    -------
    np.ndarray
      1D float32 array containing the atom features.
    """
    features = []
    # NOTE(review): 999 is presumably a placeholder slot for any other
    # element; whether it is ever set depends on how one_hot_encode treats
    # values outside the allowable set -- confirm against that helper.
    features += one_hot_encode(atom.GetAtomicNum(),
                               [5, 6, 7, 8, 9, 15, 16, 17, 35, 53, 999])
    features += one_hot_encode(len(atom.GetNeighbors()), [0, 1, 2, 3, 4, 5])
    features += one_hot_encode(atom.GetTotalNumHs(), [0, 1, 2, 3, 4])

    if self.one_hot_formal_charge:
      features += one_hot_encode(atom.GetFormalCharge(), [-1, 0, 1])
    else:
      features.append(atom.GetFormalCharge())

    # Boolean flags become 0.0 / 1.0 through the float32 conversion below.
    features.append(atom.IsInRing())
    features.append(atom.GetIsAromatic())

    return np.array(features, dtype=np.float32)

  def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """Featurize the molecule.

    Parameters
    ----------
    mol: RDKitMol
      RDKit mol object.

    Returns
    -------
    np.ndarray
      A matrix with one row per atom, made of node_features,
      adjacency_matrix and distance_matrix concatenated along axis 1.

    Raises
    ------
    ImportError
      If RDKit is not installed.
    """

    # Only import failures are translated; a bare `except` would also catch
    # unrelated errors such as KeyboardInterrupt and hide their cause.
    try:
      from rdkit import Chem
    except ImportError as err:
      raise ImportError("This class requires RDKit to be installed.") from err

    node_features = np.array(
        [self.atom_features(atom) for atom in mol.GetAtoms()])
    adjacency_matrix = Chem.rdmolops.GetAdjacencyMatrix(mol)
    distance_matrix = Chem.rdmolops.GetDistanceMatrix(mol)

    # All three blocks have one row per atom, so they are joined column-wise.
    result = np.concatenate(
        [node_features, adjacency_matrix, distance_matrix], axis=1)

    return result
+34 −0
Original line number Diff line number Diff line
import unittest
from deepchem.feat import MATFeaturizer
import numpy as np


class TestMATFeaturizer(unittest.TestCase):
  """
  Test MATFeaturizer on a small, fixed molecule.
  """

  def setUp(self):
    """
    Build the RDKit molecule for the SMILES string 'CC' used by the tests.
    """
    # RDKit is imported lazily so that merely importing this module does not
    # require it.
    from rdkit import Chem
    smiles = 'CC'
    self.mol = Chem.MolFromSmiles(smiles)

  def test_mat_featurizer(self):
    """
    Check the type, shape and exact values of the MATFeaturizer output.
    """
    featurizer = MATFeaturizer()
    out = featurizer.featurize(self.mol)
    # isinstance is the idiomatic type check (also accepts ndarray subclasses).
    assert isinstance(out, np.ndarray)
    # 1 molecule, 2 atoms, 27 node features + 2 adjacency + 2 distance columns.
    assert out.shape == (1, 2, 31)
    correct_array = np.array([[[
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1.
    ], [
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0.
    ]]])
    assert np.array_equal(out, correct_array)