Unverified Commit a1d38afb authored by Nathan Frey's avatar Nathan Frey Committed by GitHub
Browse files

Merge pull request #2423 from ncfrey/rdkitgridfeaturizer-refactor

RdkitGridFeaturizer test fixes 🐛 
parents 1af20372 82276923
Loading
Loading
Loading
Loading
+71 −172
Original line number Diff line number Diff line
@@ -3,9 +3,9 @@ import time

from deepchem.utils.rdkit_utils import MoleculeLoadException, load_molecule, compute_ecfp_features
from deepchem.utils.geometry_utils import rotate_molecules, compute_pairwise_distances, compute_centroid, subtract_centroid
from deepchem.utils.hash_utils import hash_ecfp, hash_ecfp_pair, hash_sybyl
from deepchem.utils.noncovalent_utils import compute_pi_stack, compute_hydrogen_bonds, compute_salt_bridges, compute_binding_pocket_cation_pi
from deepchem.utils.voxel_utils import convert_atom_to_voxel, convert_atom_pair_to_voxel
from deepchem.utils.hash_utils import hash_ecfp, hash_ecfp_pair, hash_sybyl, vectorize
from deepchem.utils.noncovalent_utils import compute_hydrogen_bonds, compute_salt_bridges, compute_binding_pocket_cation_pi
from deepchem.utils.voxel_utils import convert_atom_to_voxel, convert_atom_pair_to_voxel, voxelize, voxelize_pi_stack

from deepchem.feat.complex_featurizers.contact_fingerprints import featurize_contacts_ecfp, featurize_binding_pocket_sybyl
from deepchem.feat.complex_featurizers.splif_fingerprints import featurize_splif
@@ -224,8 +224,7 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      return [compute_ecfp_features(lig_rdk, self.ecfp_degree, self.ecfp_power)]
    if feature_name == 'ecfp_hashed':
      return [
          self._vectorize(
              hash_ecfp, feature_dict=ecfp_dict, channel_power=self.ecfp_power)
          vectorize(hash_ecfp, feature_dict=ecfp_dict, size=2**self.ecfp_power)
          for ecfp_dict in featurize_contacts_ecfp(
              (prot_xyz, prot_rdk), (lig_xyz, lig_rdk),
              distances,
@@ -234,18 +233,15 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      ]
    if feature_name == 'splif_hashed':
      return [
          self._vectorize(
              hash_ecfp_pair,
              feature_dict=splif_dict,
              channel_power=self.splif_power)
          vectorize(
              hash_ecfp_pair, feature_dict=splif_dict, size=2**self.splif_power)
          for splif_dict in featurize_splif((prot_xyz, prot_rdk), (
              lig_xyz, lig_rdk
          ), self.cutoffs['splif_contact_bins'], distances, self.ecfp_degree)
      ]
    if feature_name == 'hbond_count':
      return [
          self._vectorize(
              hash_ecfp_pair, feature_list=hbond_list, channel_power=0)
          vectorize(hash_ecfp_pair, feature_list=hbond_list, size=2**0)
          for hbond_list in compute_hydrogen_bonds((prot_xyz, prot_rdk), (
              lig_xyz, lig_rdk), distances, self.cutoffs[
                  'hbond_dist_bins'], self.cutoffs['hbond_angle_cutoffs'])
@@ -253,12 +249,15 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
    if feature_name == 'ecfp':
      return [
          sum([
              self._voxelize(
              voxelize(
                  convert_atom_to_voxel,
                  hash_ecfp,
                  xyz,
                  box_width=self.box_width,
                  voxel_width=self.voxel_width,
                  hash_function=hash_ecfp,
                  feature_dict=ecfp_dict,
                  channel_power=self.ecfp_power)
                  nb_channel=2**self.ecfp_power,
              )
              for xyz, ecfp_dict in zip((prot_xyz, lig_xyz),
                                        featurize_contacts_ecfp(
                                            (prot_xyz, prot_rdk), (lig_xyz,
@@ -270,24 +269,33 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      ]
    if feature_name == 'splif':
      return [
          self._voxelize(
          voxelize(
              convert_atom_pair_to_voxel,
              hash_ecfp_pair, (prot_xyz, lig_xyz),
              (prot_xyz, lig_xyz),
              box_width=self.box_width,
              voxel_width=self.voxel_width,
              hash_function=hash_ecfp_pair,
              feature_dict=splif_dict,
              channel_power=self.splif_power)
          for splif_dict in featurize_splif((prot_xyz, prot_rdk), (
              nb_channel=2**self.splif_power,
          ) for splif_dict in featurize_splif((prot_xyz, prot_rdk), (
              lig_xyz, lig_rdk
          ), self.cutoffs['splif_contact_bins'], distances, self.ecfp_degree)
      ]
    if feature_name == 'sybyl':

      def hash_sybyl_func(x):
        hash_sybyl(x, sybyl_types=self.sybyl_types)

      return [
          self._voxelize(
          voxelize(
              convert_atom_to_voxel,
              lambda x: hash_sybyl(x, sybyl_types=self.sybyl_types),
              xyz,
              box_width=self.box_width,
              voxel_width=self.voxel_width,
              hash_function=hash_sybyl_func,
              feature_dict=sybyl_dict,
              nb_channel=len(self.sybyl_types))
          for xyz, sybyl_dict in zip((prot_xyz, lig_xyz),
              nb_channel=len(self.sybyl_types),
          ) for xyz, sybyl_dict in zip((prot_xyz, lig_xyz),
                                       featurize_binding_pocket_sybyl(
                                           prot_xyz,
                                           prot_rdk,
@@ -298,25 +306,27 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      ]
    if feature_name == 'salt_bridge':
      return [
          self._voxelize(
          voxelize(
              convert_atom_pair_to_voxel,
              None, (prot_xyz, lig_xyz),
              (prot_xyz, lig_xyz),
              box_width=self.box_width,
              voxel_width=self.voxel_width,
              feature_list=compute_salt_bridges(
                  prot_xyz,
                  prot_rdk,
                  lig_xyz,
                  lig_rdk,
                  distances,
                  cutoff=self.cutoffs['salt_bridges_cutoff']),
              nb_channel=1)
              nb_channel=1,
          )
      ]
    if feature_name == 'charge':
      return [
          sum([
              self._voxelize(
              voxelize(
                  convert_atom_to_voxel,
                  None,
                  xyz,
                  box_width=self.box_width,
                  voxel_width=self.voxel_width,
                  feature_dict=compute_charge_dictionary(mol),
                  nb_channel=1,
                  dtype="np.float16")
@@ -325,27 +335,33 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      ]
    if feature_name == 'hbond':
      return [
          self._voxelize(
          voxelize(
              convert_atom_pair_to_voxel,
              None, (prot_xyz, lig_xyz),
              (prot_xyz, lig_xyz),
              box_width=self.box_width,
              voxel_width=self.voxel_width,
              feature_list=hbond_list,
              channel_power=0)
          for hbond_list in compute_hydrogen_bonds((prot_xyz, prot_rdk), (
              nb_channel=2**0,
          ) for hbond_list in compute_hydrogen_bonds((prot_xyz, prot_rdk), (
              lig_xyz, lig_rdk), distances, self.cutoffs[
                  'hbond_dist_bins'], self.cutoffs['hbond_angle_cutoffs'])
      ]
    if feature_name == 'pi_stack':
      return self._voxelize_pi_stack(prot_xyz, prot_rdk, lig_xyz, lig_rdk,
                                     distances)
      return voxelize_pi_stack(prot_xyz, prot_rdk, lig_xyz, lig_rdk, distances,
                               self.cutoffs['pi_stack_dist_cutoff'],
                               self.cutoffs['pi_stack_angle_cutoff'],
                               self.box_width, self.voxel_width)
    if feature_name == 'cation_pi':
      return [
          sum([
              self._voxelize(
              voxelize(
                  convert_atom_to_voxel,
                  None,
                  xyz,
                  box_width=self.box_width,
                  voxel_width=self.voxel_width,
                  feature_dict=cation_pi_dict,
                  nb_channel=1) for xyz, cation_pi_dict in zip(
                  nb_channel=1,
              ) for xyz, cation_pi_dict in zip(
                  (prot_xyz, lig_xyz),
                  compute_binding_pocket_cation_pi(
                      prot_rdk,
@@ -364,10 +380,11 @@ class RdkitGridFeaturizer(ComplexFeaturizer):

    This function then computes the centroid of the ligand; decrements this
    centroid from the atomic coordinates of protein and ligand atoms, and then
    merges the translated protein and ligand. This combined system/complex is then
    saved.
    merges the translated protein and ligand. This combined system/complex is
    then saved.

    This function then computes a featurization with scheme specified by the user.

    Parameters
    ----------
    complex: Tuple[str, str]
@@ -435,121 +452,3 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
    # TODO(rbharath): Is this squeeze OK?
    features = np.squeeze(np.array(list(features_dict.values())))
    return features

  def _voxelize(self,
                get_voxels,
                hash_function,
                coordinates,
                feature_dict=None,
                feature_list=None,
                channel_power=None,
                nb_channel=16,
                dtype="np.int8"):
    """Private helper function to voxelize inputs.

    Parameters
    ----------
    get_voxels: function
      Function that voxelizes inputs
    hash_function: function
      Used to map feature choices to voxel channels.
    coordinates: np.ndarray
      Contains the 3D coordinates of a molecular system.
    feature_dict: Dictionary
      Keys are atom indices.
    feature_list: list
      List of available features.
    channel_power: int
      If specified, nb_channel is set to 2**channel_power.
      TODO: This feels like a redundant parameter.
    nb_channel: int
      The number of feature channels computed per voxel.
    dtype: type
      The dtype of the numpy ndarray created to hold features.
    """

    if channel_power is not None:
      if channel_power == 0:
        nb_channel = 1
      else:
        nb_channel = int(2**channel_power)
    if dtype == "np.int8":
      feature_tensor = np.zeros(
          (self.voxels_per_edge, self.voxels_per_edge, self.voxels_per_edge,
           nb_channel),
          dtype=np.int8)
    else:
      feature_tensor = np.zeros(
          (self.voxels_per_edge, self.voxels_per_edge, self.voxels_per_edge,
           nb_channel),
          dtype=np.float16)
    if feature_dict is not None:
      for key, features in feature_dict.items():
        voxels = get_voxels(coordinates, key, self.box_width, self.voxel_width)
        for voxel in voxels:
          if ((voxel >= 0) & (voxel < self.voxels_per_edge)).all():
            if hash_function is not None:
              feature_tensor[voxel[0], voxel[1], voxel[2],
                             hash_function(features, channel_power)] += 1.0
            else:
              feature_tensor[voxel[0], voxel[1], voxel[2], 0] += features
    elif feature_list is not None:
      for key in feature_list:
        voxels = get_voxels(coordinates, key, self.box_width, self.voxel_width)
        for voxel in voxels:
          if ((voxel >= 0) & (voxel < self.voxels_per_edge)).all():
            feature_tensor[voxel[0], voxel[1], voxel[2], 0] += 1.0

    return feature_tensor

  def _voxelize_pi_stack(self, prot_xyz, prot_rdk, lig_xyz, lig_rdk, distances):
    protein_pi_t, protein_pi_parallel, ligand_pi_t, ligand_pi_parallel = (
        compute_pi_stack(
            prot_rdk,
            lig_rdk,
            distances,
            dist_cutoff=self.cutoffs['pi_stack_dist_cutoff'],
            angle_cutoff=self.cutoffs['pi_stack_angle_cutoff']))
    pi_parallel_tensor = self._voxelize(
        convert_atom_to_voxel,
        None,
        prot_xyz,
        feature_dict=protein_pi_parallel,
        nb_channel=1)
    pi_parallel_tensor += self._voxelize(
        convert_atom_to_voxel,
        None,
        lig_xyz,
        feature_dict=ligand_pi_parallel,
        nb_channel=1)

    pi_t_tensor = self._voxelize(
        convert_atom_to_voxel,
        None,
        prot_xyz,
        feature_dict=protein_pi_t,
        nb_channel=1)
    pi_t_tensor += self._voxelize(
        convert_atom_to_voxel,
        None,
        lig_xyz,
        feature_dict=ligand_pi_t,
        nb_channel=1)
    return [pi_parallel_tensor, pi_t_tensor]

  def _vectorize(self,
                 hash_function,
                 feature_dict=None,
                 feature_list=None,
                 channel_power=10):
    feature_vector = np.zeros(2**channel_power)
    if feature_dict is not None:
      on_channels = [
          hash_function(feature, channel_power)
          for key, feature in feature_dict.items()
      ]
      feature_vector[on_channels] += 1
    elif feature_list is not None:
      feature_vector[0] += len(feature_list)

    return feature_vector
+20 −17
Original line number Diff line number Diff line
@@ -5,9 +5,8 @@ import os
import unittest

import numpy as np
import pytest

from deepchem.feat.complex_featurizers import rdkit_grid_featurizer as rgf
from deepchem.feat import RdkitGridFeaturizer

np.random.seed(123)

@@ -28,15 +27,15 @@ class TestRdkitGridFeaturizer(unittest.TestCase):

  def test_default_featurizer(self):
    # test if default parameters work
    featurizer = rgf.RdkitGridFeaturizer()
    self.assertIsInstance(featurizer, rgf.RdkitGridFeaturizer)
    featurizer = RdkitGridFeaturizer()
    self.assertIsInstance(featurizer, RdkitGridFeaturizer)
    feature_tensor = featurizer.featurize([(self.ligand_file,
                                            self.protein_file)])
    self.assertIsInstance(feature_tensor, np.ndarray)

  def test_example_featurizer(self):
    # check if use-case from examples works
    featurizer = rgf.RdkitGridFeaturizer(
    featurizer = RdkitGridFeaturizer(
        voxel_width=16.0,
        feature_types=['ecfp', 'splif', 'hbond', 'salt_bridge'],
        ecfp_power=9,
@@ -48,7 +47,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):

  def test_force_flatten(self):
    # test if input is flattened when flat features are used
    featurizer = rgf.RdkitGridFeaturizer(
    featurizer = RdkitGridFeaturizer(
        feature_types=['ecfp_hashed'], flatten=False)
    featurizer.flatten = True  # False should be ignored with ecfp_hashed
    feature_tensor = featurizer.featurize([(self.ligand_file,
@@ -56,14 +55,17 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
    self.assertIsInstance(feature_tensor, np.ndarray)
    self.assertEqual(feature_tensor.shape, (1, 2 * 2**featurizer.ecfp_power))

  @pytest.mark.skip(reason="TODO: debug")
  def test_combined(self):
    ecfp_power = 5
    splif_power = 5
    box_width = 75.0
    voxel_width = 1.0
    voxels_per_edge = int(box_width / voxel_width)

    # test voxel features
    featurizer = rgf.RdkitGridFeaturizer(
        voxel_width=1.0,
        box_width=20.0,
    featurizer = RdkitGridFeaturizer(
        voxel_width=voxel_width,
        box_width=box_width,
        feature_types=['voxel_combined'],
        ecfp_power=ecfp_power,
        splif_power=splif_power,
@@ -76,10 +78,12 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
        2**ecfp_power +
        len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power + len(
            featurizer.cutoffs['hbond_dist_bins']) + 5)
    self.assertEqual(feature_tensor.shape, (1, 20, 20, 20, voxel_total_len))
    self.assertEqual(
        feature_tensor.shape,
        (1, voxels_per_edge, voxels_per_edge, voxels_per_edge, voxel_total_len))

    # test flat features
    featurizer = rgf.RdkitGridFeaturizer(
    featurizer = RdkitGridFeaturizer(
        voxel_width=1.0,
        feature_types=['flat_combined'],
        ecfp_power=ecfp_power,
@@ -94,8 +98,8 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
            featurizer.cutoffs['hbond_dist_bins']))
    self.assertEqual(feature_tensor.shape, (1, flat_total_len))

    # check if aromatic features are ignores if sanitize=False
    featurizer = rgf.RdkitGridFeaturizer(
    # check if aromatic features are ignored if sanitize=False
    featurizer = RdkitGridFeaturizer(
        voxel_width=16.0,
        feature_types=['all_combined'],
        ecfp_power=ecfp_power,
@@ -124,12 +128,11 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
        'cation_pi_dist_cutoff': 5.5,
        'cation_pi_angle_cutoff': 20.0,
    }
    rgf_featurizer = rgf.RdkitGridFeaturizer(**custom_cutoffs)
    rgf_featurizer = RdkitGridFeaturizer(**custom_cutoffs)
    self.assertEqual(rgf_featurizer.cutoffs, custom_cutoffs)

  @pytest.mark.skip(reason="TODO: debug")
  def test_rotations(self):
    featurizer = rgf.RdkitGridFeaturizer(
    featurizer = RdkitGridFeaturizer(
        nb_rotations=3,
        feature_types=['voxel_combined'],
        flatten=False,
+8 −3
Original line number Diff line number Diff line
"""
Various utilities around hash functions.
"""
from typing import Callable, Dict, Optional, Tuple, Any
from typing import Callable, Dict, Optional, Tuple, Any, List
import numpy as np
import hashlib

@@ -69,7 +69,8 @@ def hash_ecfp_pair(ecfp_pair: Tuple[str, str], size: int = 1024) -> int:

def vectorize(hash_function: Callable[[Any, int], int],
              feature_dict: Optional[Dict[int, str]] = None,
              size: int = 1024) -> np.ndarray:
              size: int = 1024,
              feature_list: Optional[List] = None) -> np.ndarray:
  """Helper function to vectorize a spatial description from a hash.

  Hash functions are used to perform spatial featurizations in
@@ -89,8 +90,10 @@ def vectorize(hash_function: Callable[[Any, int], int],
    then hashed values must fall in range `[0, 1024)`.
  feature_dict: Dict, optional (default None)
    Maps unique keys to features computed.
  size: int, optional (default 1024)
  size: int (default 1024)
    Length of generated bit vector
  feature_list: List, optional (default None)
    List of features.

  Returns
  -------
@@ -103,5 +106,7 @@ def vectorize(hash_function: Callable[[Any, int], int],
        hash_function(feature, size) for key, feature in feature_dict.items()
    ]
    feature_vector[on_channels] += 1
  elif feature_list is not None:
    feature_vector[0] += len(feature_list)

  return feature_vector
+51 −0
Original line number Diff line number Diff line
@@ -5,6 +5,8 @@ import logging
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np

from deepchem.utils.noncovalent_utils import compute_pi_stack

logger = logging.getLogger(__name__)


@@ -159,3 +161,52 @@ def voxelize(get_voxels: Callable[..., Any],
          feature_tensor[voxel[0], voxel[1], voxel[2], 0] += 1.0

  return feature_tensor


def voxelize_pi_stack(prot_xyz, prot_rdk, lig_xyz, lig_rdk, distances,
                      pi_stack_dist_cutoff, pi_stack_angle_cutoff, box_width,
                      voxel_width):
  protein_pi_t, protein_pi_parallel, ligand_pi_t, ligand_pi_parallel = (
      compute_pi_stack(
          prot_rdk,
          lig_rdk,
          distances,
          dist_cutoff=pi_stack_dist_cutoff,
          angle_cutoff=pi_stack_angle_cutoff))
  pi_parallel_tensor = voxelize(
      convert_atom_to_voxel,
      prot_xyz,
      box_width=box_width,
      voxel_width=voxel_width,
      feature_dict=protein_pi_parallel,
      nb_channel=1,
  )

  pi_parallel_tensor += voxelize(
      convert_atom_to_voxel,
      lig_xyz,
      box_width=box_width,
      voxel_width=voxel_width,
      feature_dict=ligand_pi_parallel,
      nb_channel=1,
  )

  pi_t_tensor = voxelize(
      convert_atom_to_voxel,
      prot_xyz,
      box_width=box_width,
      voxel_width=voxel_width,
      feature_dict=protein_pi_t,
      nb_channel=1,
  )

  pi_t_tensor += voxelize(
      convert_atom_to_voxel,
      lig_xyz,
      box_width=box_width,
      voxel_width=voxel_width,
      feature_dict=ligand_pi_t,
      nb_channel=1,
  )

  return [pi_parallel_tensor, pi_t_tensor]