Unverified Commit 692a2ed7 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #2571 from arunppsg/master

[WIP] Added examples for featurizers in documentation
parents aaed65f7 2ad936e7
Loading
Loading
Loading
Loading
+22 −0
Original line number Diff line number Diff line
@@ -125,6 +125,28 @@ class AtomicConformationFeaturizer(Featurizer):

  Otherwise, it is assumed to be a SMILES string.  RDKit is used to generate a
  3D conformation and to compute formal and partial charges.

  Examples
  --------
  >>> import deepchem as dc
  >>> smiles = ['CCC']
  >>> featurizer = dc.feat.AtomicConformationFeaturizer()
  >>> features = featurizer.featurize(smiles)
  >>> features[0].num_atoms
  11
  >>> sum(features[0].atomic_number == 6)
  3
  >>> sum(features[0].atomic_number == 1)
  8
  >>> type(features[0].formal_charge)
  <class 'numpy.ndarray'>
  >>> features[0].formal_charge.shape
  (11,)
  >>> type(features[0].partial_charge)
  <class 'numpy.ndarray'>
  >>> features[0].partial_charge.shape
  (11,)

  """

  def _featurize(self, datapoint: str) -> AtomicConformation:
+74 −11
Original line number Diff line number Diff line
@@ -54,6 +54,7 @@ def one_of_k_encoding_unk(x, allowable_set):
  --------
  >>> dc.feat.graph_features.one_of_k_encoding_unk("s", ["a", "b", "c"])
  [False, False, True]

  """
  if x not in allowable_set:
    x = allowable_set[-1]
@@ -78,6 +79,7 @@ def get_intervals(l):

  >>> dc.feat.graph_features.get_intervals([[1], [], [1, 2], [1, 2, 3]])
  [1, 1, 3, 12]

  """
  intervals = len(l) * [0]
  # Initialize with 1
@@ -104,10 +106,11 @@ def safe_index(l, e):
  0
  >>> dc.feat.graph_features.safe_index([1, 2, 3], 7)
  3

  """
  try:
    return l.index(e)
  except:
  except ValueError:
    return len(l)


@@ -149,7 +152,7 @@ def get_feature_list(atom):

  Parameters
  ----------
  atom: RDKit.rdchem.Atom
  atom: rdkit.Chem.rdchem.Atom
    Atom to get features for

  Examples
@@ -157,8 +160,11 @@ def get_feature_list(atom):
  >>> from rdkit import Chem
  >>> mol = Chem.MolFromSmiles("C")
  >>> atom = mol.GetAtoms()[0]
  >>> dc.feat.graph_features.get_feature_list(atom)
  [0, 4, 4, 3, 0, 2]
  >>> features = dc.feat.graph_features.get_feature_list(atom)
  >>> type(features)
  <class 'list'>
  >>> len(features)
  6

  Note
  ----
@@ -259,7 +265,7 @@ def atom_to_id(atom):

  Parameters
  ----------
  atom: RDKit.rdchem.Atom
  atom: rdkit.Chem.rdchem.Atom
    Atom to convert to ids.

  Returns
@@ -281,6 +287,8 @@ def atom_features(atom,

  Parameters
  ----------
  atom: rdkit.Chem.rdchem.Atom
    Atom to compute features on.
  bool_id_feat: bool, optional
    Return an array of unique identifiers corresponding to atom type.
  explicit_H: bool, optional
@@ -290,7 +298,20 @@ def atom_features(atom,

  Returns
  -------
  np.ndarray of per-atom features.
  features: np.ndarray
    An array of per-atom features.

  Examples
  --------
  >>> from rdkit import Chem
  >>> mol = Chem.MolFromSmiles('CCC')
  >>> atom = mol.GetAtoms()[0]
  >>> features = dc.feat.graph_features.atom_features(atom)
  >>> type(features)
  <class 'numpy.ndarray'>
  >>> features.shape
  (75,)

  """
  if bool_id_feat:
    return np.array([atom_to_id(atom)])
@@ -376,6 +397,8 @@ def bond_features(bond, use_chirality=False):

  Parameters
  ----------
  bond: rdkit.Chem.rdchem.Bond
    Bond to compute features on.
  use_chirality: bool, optional
    If true, use chirality information.
  
@@ -388,6 +411,22 @@ def bond_features(bond, use_chirality=False):
  bond_feats: np.ndarray
    Array of bond features. This is a 1-D array of length 6 if `use_chirality`
    is `False` else of length 10 with chirality encoded.

  Examples
  --------
  >>> from rdkit import Chem
  >>> mol = Chem.MolFromSmiles('CCC')
  >>> bond = mol.GetBonds()[0]
  >>> bond_features = dc.feat.graph_features.bond_features(bond)
  >>> type(bond_features)
  <class 'numpy.ndarray'>
  >>> bond_features.shape
  (6,)

  Note
  ----
  This method requires RDKit to be installed.

  """
  try:
    from rdkit import Chem
@@ -406,8 +445,8 @@ def bond_features(bond, use_chirality=False):
  return np.array(bond_feats)


def max_pair_distance_pairs(mol: RDKitMol,
                            max_pair_distance: Optional[int]) -> np.ndarray:
def max_pair_distance_pairs(
    mol: RDKitMol, max_pair_distance: Optional[int] = None) -> np.ndarray:
  """Helper method which finds atom pairs within max_pair_distance graph distance.

  This helper method is used to find atoms which are within max_pair_distance
@@ -434,6 +473,15 @@ def max_pair_distance_pairs(mol: RDKitMol,
    distance 2 apart. If `max_pair_distance` is `None`, all pairs are
    considered (effectively infinite `max_pair_distance`)

  Examples
  --------
  >>> from rdkit import Chem
  >>> mol = Chem.MolFromSmiles('CCC')
  >>> features = dc.feat.graph_features.max_pair_distance_pairs(mol, 1)
  >>> type(features)
  <class 'numpy.ndarray'>
  >>> features.shape  # (2, num_pairs)
  (2, 7)

  Returns
  -------
@@ -862,9 +910,24 @@ class WeaveFeaturizer(MolecularFeaturizer):
  Examples
  --------
  >>> import deepchem as dc
  >>> mols = ["C", "CCC"]
  >>> mols = ["CCC"]
  >>> featurizer = dc.feat.WeaveFeaturizer()
  >>> X = featurizer.featurize(mols)
  >>> features = featurizer.featurize(mols)
  >>> type(features[0])
  <class 'deepchem.feat.mol_graphs.WeaveMol'>
  >>> features[0].get_num_atoms() # 3 atoms in compound
  3
  >>> features[0].get_num_features() # feature size
  75
  >>> type(features[0].get_atom_features())
  <class 'numpy.ndarray'>
  >>> features[0].get_atom_features().shape
  (3, 75)
  >>> type(features[0].get_pair_features())
  <class 'numpy.ndarray'>
  >>> features[0].get_pair_features().shape
  (9, 14)


  References
  ----------
+16 −0
Original line number Diff line number Diff line
@@ -10,6 +10,22 @@ from deepchem.utils.typing import RDKitMol
class AtomicCoordinates(MolecularFeaturizer):
  """Calculate atomic coordinates.

  Examples
  --------
  >>> import deepchem as dc
  >>> from rdkit import Chem
  >>> mol = Chem.MolFromSmiles('C1C=CC=CC=1')
  >>> n_atoms = len(mol.GetAtoms())
  >>> n_atoms
  6
  >>> featurizer = dc.feat.AtomicCoordinates(use_bohr=False)
  >>> features = featurizer.featurize([mol])
  >>> type(features[0])
  <class 'numpy.ndarray'>
  >>> features[0].shape # (n_atoms, 3)
  (6, 3)


  Note
  ----
  This class requires RDKit to be installed.
+12 −1
Original line number Diff line number Diff line
@@ -9,7 +9,18 @@ from deepchem.feat.molecule_featurizers.atomic_coordinates import AtomicCoordina
class BPSymmetryFunctionInput(MolecularFeaturizer):
  """Calculate symmetry function for each atom in the molecules

  This method is described in [1]_
  This method is described in [1]_.

  Examples
  --------
  >>> import deepchem as dc
  >>> smiles = ['C1C=CC=CC=1']
  >>> featurizer = dc.feat.BPSymmetryFunctionInput(max_atoms=10)
  >>> features = featurizer.featurize(smiles)
  >>> type(features[0])
  <class 'numpy.ndarray'>
  >>> features[0].shape  # (max_atoms, 4)
  (10, 4)

  References
  ----------
+23 −1
Original line number Diff line number Diff line
@@ -14,7 +14,8 @@ class CircularFingerprint(MolecularFeaturizer):

  Extended Connectivity Circular Fingerprints compute a bag-of-words style
  representation of a molecule by breaking it into local neighborhoods and
  hashing into a bit vector of the specified size. See [1]_ for more details.
  hashing into a bit vector of the specified size. These fingerprints are
  used specifically for structure-activity modelling. See [1]_ for more
  details.

  References
  ----------
@@ -24,6 +25,27 @@ class CircularFingerprint(MolecularFeaturizer):
  Note
  ----
  This class requires RDKit to be installed.

  Examples
  --------
  >>> import deepchem as dc
  >>> from rdkit import Chem
  >>> smiles = ['C1=CC=CC=C1']
  >>> # Example 1: (size = 2048, radius = 4)
  >>> featurizer = dc.feat.CircularFingerprint(size=2048, radius=4)
  >>> features = featurizer.featurize(smiles)
  >>> type(features[0])
  <class 'numpy.ndarray'>
  >>> features[0].shape
  (2048,)

  >>> # Example 2: (size = 2048, radius = 8, sparse = True, smiles = True)
  >>> featurizer = dc.feat.CircularFingerprint(size=2048, radius=8,
  ...                                          sparse=True, smiles=True)
  >>> features = featurizer.featurize(smiles)
  >>> type(features[0]) # dict containing fingerprints
  <class 'dict'>

  """

  def __init__(self,
Loading