Commit 9038d588 authored by nd-02110114's avatar nd-02110114
Browse files

Merge branch 'master' into update-data-2

parents 118151c9 3d257a0c
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -32,18 +32,18 @@ install:
  - conda update -q conda
  - bash scripts/install_deepchem_conda.sh cpu
  - conda activate deepchem
  - python setup.py install
  - pip install -e .
script:
  - bash devtools/run_yapf.sh
  - bash devtools/run_flake8.sh
  - mypy -p deepchem
  - pytest -m "not slow" --cov=deepchem deepchem
  - pytest -v -m "not slow" --cov=deepchem deepchem
  - if [ $TRAVIS_PYTHON_VERSION == '3.7' ]; then
      cd docs && pip install -r requirements.txt;
      make clean html && cd ..;
    fi
  - if [ $TRAVIS_PYTHON_VERSION == '3.7' ]; then
      find ./deepchem -name "*.py" ! -name '*load_dataset_template.py' | xargs python -m doctest -v;
      pytest -v --ignore-glob='deepchem/**/test*.py' --doctest-modules deepchem;
    fi
after_success:
  - echo $TRAVIS_SECURE_ENV_VARS
+6 −4
Original line number Diff line number Diff line
@@ -190,14 +190,14 @@ def weighted_linear_sum(w: np.ndarray, x: np.ndarray) -> np.ndarray:
  w: np.ndarray
    A numpy array of shape `(N,)`
  x: np.ndarray
    A numpy array of shape `(N,)`
    A numpy array of shape `(N, M, L)`

  Returns
  -------
  np.ndarray
    A scalar value
    A numpy array of shape `(M, L)`
  """
  return np.sum(np.dot(w, x))
  return np.tensordot(w, x, axes=1)


def vina_energy_term(coords1: np.ndarray, coords2: np.ndarray,
@@ -211,7 +211,9 @@ def vina_energy_term(coords1: np.ndarray, coords2: np.ndarray,
  coords2: np.ndarray
    Molecular coordinates of shape `(M, 3)`
  weights: np.ndarray
    A numpy array of shape `(5,)`
    A numpy array of shape `(5,)`. The 5 values are weights for repulsion interaction term,
    hydrophobic interaction term, hydrogen bond interaction term,
    first Gaussian interaction term and second Gaussian interaction term.
  wrot: float
    The scaling factor for nonlinearity
  Nrot: int
+9 −0
Original line number Diff line number Diff line
"""
Making it easy to import in classes.
"""
# flake8: noqa

# base classes for featurizers
from deepchem.feat.base_classes import Featurizer
from deepchem.feat.base_classes import MolecularFeaturizer
from deepchem.feat.base_classes import MaterialStructureFeaturizer
from deepchem.feat.base_classes import MaterialCompositionFeaturizer
from deepchem.feat.base_classes import ComplexFeaturizer
from deepchem.feat.base_classes import UserDefinedFeaturizer

from deepchem.feat.graph_features import ConvMolFeaturizer
from deepchem.feat.graph_features import WeaveFeaturizer
from deepchem.feat.fingerprints import CircularFingerprint
@@ -22,6 +26,11 @@ from deepchem.feat.atomic_coordinates import AtomicCoordinates
from deepchem.feat.atomic_coordinates import NeighborListComplexAtomicCoordinates
from deepchem.feat.adjacency_fingerprints import AdjacencyFingerprint
from deepchem.feat.smiles_featurizers import SmilesToSeq, SmilesToImage

# molecule featurizers
from deepchem.feat.molecule_featurizers import MolGraphConvFeaturizer

# material featurizers
from deepchem.feat.material_featurizers import ElementPropertyFingerprint
from deepchem.feat.material_featurizers import SineCoulombMatrix
from deepchem.feat.material_featurizers import CGCNNFeaturizer
+55 −0
Original line number Diff line number Diff line
"""
Feature calculations.
"""
import inspect
import logging
import numpy as np
import multiprocessing
@@ -75,6 +76,60 @@ class Featurizer(object):
    """
    raise NotImplementedError('Featurizer is not defined.')

  def __repr__(self) -> str:
    """Convert self to repr representation.

    The constructor arguments are discovered by inspecting `__init__`, and
    the current value of each argument is read from the instance attribute
    of the same name. Constructor arguments that are not stored on the
    instance under their own name are left out of the output instead of
    making `repr()` raise a `KeyError`.

    Returns
    -------
    str
      The string represents the class, formatted as
      `ClassName[arg1=value1, arg2=value2, ...]`.

    Examples
    --------
    >>> import deepchem as dc
    >>> dc.feat.CircularFingerprint(size=1024, radius=4)
    CircularFingerprint[radius=4, size=1024, chiral=False, bonds=True, features=False, sparse=False, smiles=False]
    >>> dc.feat.CGCNNFeaturizer()
    CGCNNFeaturizer[radius=8.0, max_neighbors=8, step=0.2]
    """
    args_spec = inspect.getfullargspec(self.__init__)  # type: ignore
    attributes = self.__dict__
    # `getfullargspec` reports `self` even for a bound method, so drop it.
    args_info = ', '.join(
        arg_name + '=' + str(attributes[arg_name])
        for arg_name in args_spec.args
        if arg_name != 'self' and arg_name in attributes)
    return self.__class__.__name__ + '[' + args_info + ']'

  def __str__(self) -> str:
    """Convert self to str representation.

    The result is the class name followed by `_<arg>_<value>` for every
    constructor argument whose current value differs from its default.
    Arguments without a default are compared against `None`. Current
    values are read from the instance attribute named like the argument;
    arguments that are not stored under their own name are skipped.

    Returns
    -------
    str
      The string represents the class.

    Examples
    --------
    >>> import deepchem as dc
    >>> str(dc.feat.CircularFingerprint(size=1024, radius=4))
    'CircularFingerprint_radius_4_size_1024'
    >>> str(dc.feat.CGCNNFeaturizer())
    'CGCNNFeaturizer'
    """
    args_spec = inspect.getfullargspec(self.__init__)  # type: ignore
    # `getfullargspec` reports `self` even for a bound method, so drop it.
    args_names = [arg for arg in args_spec.args if arg != 'self']
    # Defaults belong to the trailing arguments; pad the front with None so
    # that names and defaults line up one-to-one.
    defaults = list(args_spec.defaults or ())
    args_default_values = [None] * (len(args_names) - len(defaults)) + defaults

    attributes = self.__dict__
    overrides = []
    for arg_name, default in zip(args_names, args_default_values):
      if arg_name not in attributes:
        # The constructor did not keep this argument under its own name.
        continue
      arg_value = attributes[arg_name]
      try:
        overridden = bool(default != arg_value)
      except ValueError:
        # Element-wise comparisons (e.g. numpy arrays) have no single truth
        # value; conservatively report the argument as overridden.
        overridden = True
      if overridden:
        overrides.append('_' + arg_name + '_' + str(arg_value))
    return self.__class__.__name__ + ''.join(overrides)


class ComplexFeaturizer(object):
  """"
+187 −52
Original line number Diff line number Diff line
@@ -5,6 +5,8 @@ from deepchem.feat.atomic_coordinates import ComplexNeighborListFragmentAtomicCo
from deepchem.feat.mol_graphs import ConvMol, WeaveMol
from deepchem.data import DiskDataset
import logging
from typing import Optional, List
from deepchem.utils.typing import RDKitMol, RDKitAtom


def one_of_k_encoding(x, allowable_set):
@@ -398,12 +400,75 @@ def bond_features(bond, use_chirality=False):
  ]
  if use_chirality:
    bond_feats = bond_feats + one_of_k_encoding_unk(
        str(bond.GetStereo()), possible_bond_stereo)
        str(bond.GetStereo()), GraphConvCoonstants.possible_bond_stereo)
  return np.array(bond_feats)


def pair_features(mol, edge_list, canon_adj_list, bt_len=6,
                  graph_distance=True):
def max_pair_distance_pairs(mol: RDKitMol,
                            max_pair_distance: Optional[int]) -> np.ndarray:
  """Helper method which finds atom pairs within max_pair_distance graph distance.

  Two atoms are paired when the shortest bond path between them is at most
  `max_pair_distance` bonds long; every atom is also paired with itself.
  The pairs are found by repeatedly growing a boolean reachability matrix
  by one bond: starting from the identity (distance 0), step `k` marks
  `(i, j)` if `j` is a bonded neighbor of some atom already reachable from
  `i` within `k - 1` bonds. This yields the same pairs as the nonzero
  entries of `I + adj + adj**2 + ... + adj**max_pair_distance`, but it
  never materializes the walk counts stored in the matrix powers, which
  grow exponentially and can silently overflow integer arrays.

  Parameters
  ----------
  mol: rdkit.Chem.rdchem.Mol
    RDKit molecule.
  max_pair_distance: Optional[int], (default None)
    This value can be a positive integer or None. This
    parameter determines the maximum graph distance at which pair
    features are computed. For example, if `max_pair_distance==2`,
    then pair features are computed only for atoms at most graph
    distance 2 apart. If `max_pair_distance` is `None`, all pairs
    connected by some bond path are considered (effectively infinite
    `max_pair_distance`).

  Returns
  -------
  np.ndarray
    Of shape `(2, num_pairs)` where `num_pairs` is the total number of pairs
    within `max_pair_distance` of one another. Row 0 holds the first atom
    index of each pair and row 1 the second, in row-major order.

  Raises
  ------
  ValueError
    If `max_pair_distance` is an integer that is not positive.
  """
  from rdkit.Chem import rdmolops
  N = mol.GetNumAtoms()
  if max_pair_distance is None or max_pair_distance >= N:
    # No shortest path in an N-atom graph is longer than N - 1 bonds, so N
    # steps are always enough to reach every connected pair.
    max_distance = N
  elif max_pair_distance <= 0:
    raise ValueError(
        "max_pair_distance must either be a positive integer or None")
  else:
    max_distance = max_pair_distance

  # Only bond presence matters here, so reduce the adjacency matrix to 0/1.
  adj = (np.asarray(rdmolops.GetAdjacencyMatrix(mol)) != 0).astype(np.intp)
  # Handle edge case of self-pairs (i, i): distance 0.
  reachable = np.eye(N, dtype=bool)
  for _ in range(max_distance):
    # Entries of this product are bounded by N, so they cannot overflow.
    expanded = reachable | (np.dot(reachable.astype(np.intp), adj) > 0)
    if np.array_equal(expanded, reachable):
      # Nothing new was reached; further steps cannot add pairs.
      break
    reachable = expanded

  # np.nonzero returns (row_indices, col_indices); stack to (2, num_pairs).
  return np.stack(np.nonzero(reachable))


def pair_features(mol: RDKitMol,
                  bond_features_map: dict,
                  bond_adj_list: List,
                  bt_len: int = 6,
                  graph_distance: bool = True,
                  max_pair_distance: Optional[int] = None) -> np.ndarray:
  """Helper method used to compute atom pair feature vectors.

  Many different featurization methods compute atom pair features
@@ -415,16 +480,26 @@ def pair_features(mol, edge_list, canon_adj_list, bt_len=6,
  ----------
  mol: RDKit Mol
    Molecule to compute features on.
  edge_list: list
    List of edges to consider
  canon_adj_list: list of lists
    `canon_adj_list[i]` is a list of the atom indices that atom `i` shares a
    list. This list is symmetrical so if `j in canon_adj_list[i]` then `i in
    canon_adj_list[j]`.
  bond_features_map: dict 
    Dictionary that maps pairs of atom ids (say `(2, 3)` for a bond between
    atoms 2 and 3) to the features for the bond between them.
  bond_adj_list: list of lists
    `bond_adj_list[i]` is a list of the atom indices that atom `i` shares a
    bond with. This list is symmetrical so if `j in bond_adj_list[i]` then `i
    in bond_adj_list[j]`.
  bt_len: int, optional (default 6)
    The number of different bond types to consider.
  graph_distance: bool, optional (default True)
    If true, use graph distance between molecules. Else use euclidean distance.
    If true, use graph distance between atoms. Else use Euclidean
    distance, in which case the specified `mol` must have a conformer:
    atomic positions will be retrieved by calling `mol.GetConformer(0)`.
  max_pair_distance: Optional[int], (default None)
    This value can be a positive integer or None. This
    parameter determines the maximum graph distance at which pair
    features are computed. For example, if `max_pair_distance==2`,
    then pair features are computed only for atoms at most graph
    distance 2 apart. If `max_pair_distance` is `None`, all pairs are
    considered (effectively infinite `max_pair_distance`)

  Note
  ----
@@ -433,32 +508,65 @@ def pair_features(mol, edge_list, canon_adj_list, bt_len=6,
  Returns
  -------
  features: np.ndarray
    Of shape `(N, N, bt_len + max_distance + 1)`. This is the array of pairwise
    features for all atom pairs.
    Of shape `(N_edges, bt_len + max_distance + 1)`. This is the array
    of pairwise features for all atom pairs, where N_edges is the
    number of edges within max_pair_distance of one another in this
    molecule.
  pair_edges: np.ndarray
    Of shape `(2, num_pairs)` where `num_pairs` is the total number of
    pairs within `max_pair_distance` of one another.
  """
  if graph_distance:
    max_distance = 7
  else:
    max_distance = 1
  N = mol.GetNumAtoms()
  features = np.zeros((N, N, bt_len + max_distance + 1))
  pair_edges = max_pair_distance_pairs(mol, max_pair_distance)
  num_pairs = pair_edges.shape[1]
  N_edges = pair_edges.shape[1]
  features = np.zeros((N_edges, bt_len + max_distance + 1))
  # Get mapping
  mapping = {}
  for n in range(N_edges):
    a1, a2 = pair_edges[:, n]
    mapping[(int(a1), int(a2))] = n
  num_atoms = mol.GetNumAtoms()
  rings = mol.GetRingInfo().AtomRings()
  for a1 in range(num_atoms):
    for a2 in canon_adj_list[a1]:
    for a2 in bond_adj_list[a1]:
      # first `bt_len` features are bond features(if applicable)
      features[a1, a2, :bt_len] = np.asarray(
          edge_list[tuple(sorted((a1, a2)))], dtype=float)
      if (int(a1), int(a2)) not in mapping:
        raise ValueError(
            "Malformed molecule with bonds not in specified graph distance.")
      else:
        n = mapping[(int(a1), int(a2))]
      features[n, :bt_len] = np.asarray(
          bond_features_map[tuple(sorted((a1, a2)))], dtype=float)
    for ring in rings:
      if a1 in ring:
        for a2 in ring:
          if (int(a1), int(a2)) not in mapping:
            # For ring pairs outside max pairs distance continue
            continue
          else:
            n = mapping[(int(a1), int(a2))]
          # `bt_len`-th feature is if the pair of atoms are in the same ring
        features[a1, ring, bt_len] = 1
        features[a1, a1, bt_len] = 0.
          if a2 == a1:
            features[n, bt_len] = 0
          else:
            features[n, bt_len] = 1
    # graph distance between two atoms
    if graph_distance:
      # distance is a matrix of 1-hot encoded distances for all atoms
      distance = find_distance(
          a1, num_atoms, canon_adj_list, max_distance=max_distance)
      features[a1, :, bt_len + 1:] = distance
          a1, num_atoms, bond_adj_list, max_distance=max_distance)
      for a2 in range(num_atoms):
        if (int(a1), int(a2)) not in mapping:
          # Skip atom pairs outside max_pair_distance
          continue
        else:
          n = mapping[(int(a1), int(a2))]
          features[n, bt_len + 1:] = distance[a2]
  # Euclidean distance between atoms
  if not graph_distance:
    coords = np.zeros((N, 3))
@@ -469,10 +577,11 @@ def pair_features(mol, edge_list, canon_adj_list, bt_len=6,
      np.stack([coords] * N, axis=1) - \
      np.stack([coords] * N, axis=0)), axis=2))

  return features
  return features, pair_edges


def find_distance(a1, num_atoms, canon_adj_list, max_distance=7):
def find_distance(a1: RDKitAtom, num_atoms: int, bond_adj_list,
                  max_distance=7) -> np.ndarray:
  """Computes distances from provided atom.

  Parameters
@@ -481,10 +590,10 @@ def find_distance(a1, num_atoms, canon_adj_list, max_distance=7):
    The source atom to compute distances from.
  num_atoms: int
    The total number of atoms.
  canon_adj_list: list of lists
    `canon_adj_list[i]` is a list of the atom indices that atom `i` shares a
    list. This list is symmetrical so if `j in canon_adj_list[i]` then `i in
    canon_adj_list[j]`.
  bond_adj_list: list of lists
    `bond_adj_list[i]` is a list of the atom indices that atom `i` shares a
    bond with. This list is symmetrical so if `j in bond_adj_list[i]` then `i in
    bond_adj_list[j]`.
  max_distance: int, optional (default 7)
    The max distance to search.

@@ -498,7 +607,7 @@ def find_distance(a1, num_atoms, canon_adj_list, max_distance=7):
  distance = np.zeros((num_atoms, max_distance))
  radial = 0
  # atoms `radial` bonds away from `a1`
  adj_list = set(canon_adj_list[a1])
  adj_list = set(bond_adj_list[a1])
  # atoms less than `radial` bonds away
  all_list = set([a1])
  while radial < max_distance:
@@ -507,7 +616,7 @@ def find_distance(a1, num_atoms, canon_adj_list, max_distance=7):
    # find atoms `radial`+1 bonds away
    next_adj = set()
    for adj in adj_list:
      next_adj.update(canon_adj_list[adj])
      next_adj.update(bond_adj_list[adj])
    adj_list = next_adj - all_list
    radial = radial + 1
  return distance
@@ -647,6 +756,14 @@ class WeaveFeaturizer(MolecularFeaturizer):
  descriptors for each pair of atoms. These extra descriptors may provide for
  additional descriptive power but at the cost of a larger featurized dataset.


  Examples
  --------
  >>> import deepchem as dc
  >>> mols = ["C", "CCC"]
  >>> featurizer = dc.feat.WeaveFeaturizer()
  >>> X = featurizer.featurize(mols)

  References
  ----------
  .. [1] Kearnes, Steven, et al. "Molecular graph convolutions: moving beyond
@@ -660,18 +777,31 @@ class WeaveFeaturizer(MolecularFeaturizer):

  name = ['weave_mol']

  def __init__(self, graph_distance=True, explicit_H=False,
               use_chirality=False):
    """
  def __init__(self,
               graph_distance: bool = True,
               explicit_H: bool = False,
               use_chirality: bool = False,
               max_pair_distance: Optional[int] = None):
    """Initialize this featurizer with set parameters.

    Parameters
    ----------
    graph_distance: bool, optional
      If true, use graph distance. Otherwise, use Euclidean
      distance.
    explicit_H: bool, optional
    graph_distance: bool, (default True)
      If True, use graph distance for distance features. Otherwise, use
      Euclidean distance. Note that molecules that this featurizer is
      invoked on must have valid conformer information if this option is
      set to False.
    explicit_H: bool, (default False) 
      If true, model hydrogens in the molecule.
    use_chirality: bool, optional
    use_chirality: bool, (default False)
      If true, use chiral information in the featurization
    max_pair_distance: Optional[int], (default None)
      This value can be a positive integer or None. This
      parameter determines the maximum graph distance at which pair
      features are computed. For example, if `max_pair_distance==2`,
      then pair features are computed only for atoms at most graph
      distance 2 apart. If `max_pair_distance` is `None`, all pairs are
      considered (effectively infinite `max_pair_distance`)
    """
    # Distance is either graph distance(True) or Euclidean distance(False,
    # only support datasets providing Cartesian coordinates)
@@ -682,9 +812,13 @@ class WeaveFeaturizer(MolecularFeaturizer):
    self.explicit_H = explicit_H
    # If uses use_chirality
    self.use_chirality = use_chirality
    if isinstance(max_pair_distance, int) and max_pair_distance <= 0:
      raise ValueError(
          "max_pair_distance must either be a positive integer or None")
    self.max_pair_distance = max_pair_distance
    if self.use_chirality:
      self.bt_len = int(
          GraphConvConstants.bond_fdim_base) + len(possible_bond_stereo)
      self.bt_len = int(GraphConvConstants.bond_fdim_base) + len(
          GraphConvConstants.possible_bond_stereo)
    else:
      self.bt_len = int(GraphConvConstants.bond_fdim_base)

@@ -704,27 +838,28 @@ class WeaveFeaturizer(MolecularFeaturizer):
    nodes = np.vstack(nodes)

    # Get bond lists
    edge_list = {}
    bond_features_map = {}
    for b in mol.GetBonds():
      edge_list[tuple(sorted([b.GetBeginAtomIdx(),
      bond_features_map[tuple(sorted([b.GetBeginAtomIdx(),
                                      b.GetEndAtomIdx()]))] = bond_features(
                                          b, use_chirality=self.use_chirality)

    # Get canonical adjacency list
    canon_adj_list = [[] for mol_id in range(len(nodes))]
    for edge in edge_list.keys():
      canon_adj_list[edge[0]].append(edge[1])
      canon_adj_list[edge[1]].append(edge[0])
    bond_adj_list = [[] for mol_id in range(len(nodes))]
    for bond in bond_features_map.keys():
      bond_adj_list[bond[0]].append(bond[1])
      bond_adj_list[bond[1]].append(bond[0])

    # Calculate pair features
    pairs = pair_features(
    pairs, pair_edges = pair_features(
        mol,
        edge_list,
        canon_adj_list,
        bond_features_map,
        bond_adj_list,
        bt_len=self.bt_len,
        graph_distance=self.graph_distance)
        graph_distance=self.graph_distance,
        max_pair_distance=self.max_pair_distance)

    return WeaveMol(nodes, pairs)
    return WeaveMol(nodes, pairs, pair_edges)


class AtomicConvFeaturizer(ComplexNeighborListFragmentAtomicCoordinates):
Loading