Commit ec86aaed authored by marta-sd's avatar marta-sd
Browse files

more cleanup:

* removed unused code
* default featurization changed to ecfp (ecfp_ligand needed sanitize=True)
* ignore features that cannot be calculated
* more tests
parent 1b5e9b48
Loading
Loading
Loading
Loading
+19 −71
Original line number Diff line number Diff line
@@ -41,13 +41,6 @@ def get_ligand_filetype(ligand_filename):
    raise ValueError("Unrecognized_filename")


def merge_two_dicts(x, y):
  """Given two dicts, merge them into a new dict as a shallow copy."""
  z = x.copy()
  z.update(y)
  return z


def compute_centroid(coordinates):
  """Compute the x,y,z centroid of provided coordinates

@@ -169,7 +162,7 @@ def angle_between(vector_i, vector_j):
  vector_j_u = unit_vector(vector_j)
  angle = np.arccos(np.dot(vector_i_u, vector_j_u))
  if np.isnan(angle):
    if (vector_i_u == vector_j_u).all():
    if np.allclose(vector_i_u, vector_j_u):
      return 0.0
    else:
      return np.pi
@@ -363,8 +356,6 @@ def featurize_splif(protein_xyz, protein, ligand_xyz, ligand, contact_bins,
  (protein_ecfp_i, ligand_ecfp_j) tuples. Return a list of such splif
  dictionaries.
  """
  if pairwise_distances is None:
    pairwise_distances = compute_pairwise_distances(protein_xyz, ligand_xyz)
  splif_dicts = []
  for i, contact_bin in enumerate(contact_bins):
    splif_dicts.append(
@@ -904,7 +895,6 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
               ecfp_degree=2,
               ecfp_power=3,
               splif_power=3,
               ligand_only=False,
               box_width=16.0,
               voxel_width=1.0,
               flatten=False,
@@ -915,7 +905,7 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
    -----------
    nb_rotations: int, optional (default 0)
      Number of additional random rotations of a complex to generate.
    feature_types: list, optional (default ['ecfp_ligand'])
    feature_types: list, optional (default ['ecfp'])
      Types of features to calculate. Available types are:
        flat features: 'ecfp_ligand', 'ecfp_hashed', 'splif_hashed', 'hbond_count'
        voxel features: 'ecfp', 'splif', 'sybyl', 'salt_bridge', 'charge', 'hbond',
@@ -932,8 +922,6 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
    splif_power: int, optional (default 3)
      Number of bits to store SPLIF features (resulting vector will be
      2^splif_power long)
    ligand_only: bool, optional (defaul False)
      Do not load protein. Can speed up computations when are used.
    box_width: float, optional (default 16.0)
      Size of a box in which voxel features are calculated. Box is centered on a
      ligand centroid.
@@ -944,6 +932,9 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      flattened if flat features are specified in feature_types.
    verbose: bool, optional (defaul True)
      Verbolity for logging
    sanitize: bool, optional (defaul False)
      If set to True molecules will be sanitized. Note that calculating some
      features (e.g. aromatic interactions) require sanitized molecules.
    **kwargs: dict, optional
      Keyword arguments can be usaed to specify custom cutoffs and bins (see
      default values below).
@@ -969,7 +960,10 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
    ]

    # list of features that require sanitized molecules
    require_sanitized = ['pi_stack', 'cation_pi']
    require_sanitized = ['pi_stack', 'cation_pi', 'ecfp_ligand']

    # not implemented featurization types
    not_implemented = ['sybyl']

    for arg in deprecated_args:
      if arg in kwargs and verbose:
@@ -987,8 +981,6 @@ class RdkitGridFeaturizer(ComplexFeaturizer):

    self.nb_rotations = nb_rotations

    self.ligand_only = ligand_only

    # default values
    self.cutoffs = {
        'hbond_dist_bins': [(2.2, 2.5), (2.5, 3.2), (3.2, 4.0)],
@@ -1221,7 +1213,7 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
    }

    if feature_types is None:
      feature_types = ['ecfp_ligand']
      feature_types = ['ecfp']

    # each entry is a tuple (is_flat, feature_name)
    self.feature_types = []
@@ -1231,6 +1223,7 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
    ignored_features = []
    if self.sanitize is False:
      ignored_features += require_sanitized
    ignored_features += not_implemented

    # parse provided feature types
    for feature_type in feature_types:
@@ -1239,6 +1232,11 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
          warn('sanitize is set to False, %s feature will be ignored' %
               feature_type)
        continue
      if feature_type in not_implemented:
        if self.verbose:
          warn('%s feature is not implemented yet and will be ignored' %
               feature_type)
        continue

      if feature_type in self.FLAT_FEATURES:
        self.feature_types.append((True, feature_type))
@@ -1348,7 +1346,6 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
    time1 = time.time()
    ############################################################## TIMING

    if not self.ligand_only:
    protein_xyz, protein_rdk = load_molecule(
        protein_pdb, calc_charges=True, sanitize=self.sanitize)
    ############################################################## TIMING
@@ -1372,7 +1369,6 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
    ############################################################## TIMING
    centroid = compute_centroid(ligand_xyz)
    ligand_xyz = subtract_centroid(ligand_xyz, centroid)
    if not self.ligand_only:
    protein_xyz = subtract_centroid(protein_xyz, centroid)
    ############################################################## TIMING
    time2 = time.time()
@@ -1473,51 +1469,3 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      feature_vector[0] += len(feature_list)

    return feature_vector

  def _compute_flat_features(self, protein_xyz, protein_ob, ligand_xyz,
                             ligand_ob):
    """Computes vectorial (as opposed to tensorial) featurization."""
    pairwise_distances = compute_pairwise_distances(protein_xyz, ligand_xyz)
    protein_ecfp_dict, ligand_ecfp_dict = featurize_binding_pocket_ecfp(
        protein_xyz,
        protein_ob,
        ligand_xyz,
        ligand_ob,
        pairwise_distances,
        cutoff=4.5,
        ecfp_degree=self.ecfp_degree)
    splif_dicts = featurize_splif(protein_xyz, protein_ob, ligand_xyz,
                                  ligand_ob, self.contact_bins,
                                  pairwise_distances, self.ecfp_degree)
    hbond_list = compute_hydrogen_bonds(
        protein_xyz, protein_ob, ligand_xyz, ligand_ob, pairwise_distances,
        self.hbond_dist_bins, self.hbond_angle_cutoffs)

    protein_ecfp_vector = [
        self._vectorize(
            hash_ecfp,
            feature_dict=protein_ecfp_dict,
            channel_power=self.ecfp_power)
    ]
    ligand_ecfp_vector = [
        self._vectorize(
            hash_ecfp,
            feature_dict=ligand_ecfp_dict,
            channel_power=self.ecfp_power)
    ]
    splif_vectors = [
        self._vectorize(
            hash_ecfp_pair,
            feature_dict=splif_dict,
            channel_power=self.splif_power) for splif_dict in splif_dicts
    ]
    hbond_vectors = [
        self._vectorize(
            hash_ecfp_pair, feature_list=hbond_list, channel_power=0)
        for hbond_class in hbond_list
    ]
    feature_vectors = protein_ecfp_vector + \
                      ligand_ecfp_vector + splif_vectors + hbond_vectors
    feature_vector = np.concatenate(feature_vectors, axis=0)

    return ({(0, 0): feature_vector})
+105 −12
Original line number Diff line number Diff line
@@ -115,6 +115,8 @@ class TestHelperFunctions(unittest.TestCase):
      angle = rgf.angle_between(v1, v2)
      self.assertLessEqual(angle, np.pi)
      self.assertGreaterEqual(angle, 0.0)
      self.assertAlmostEqual(rgf.angle_between(v1, v1), 0.0)
      self.assertAlmostEqual(rgf.angle_between(v1, -v1), np.pi)

  def test_hash_ecfp(self):
    for power in (2, 16, 64):
@@ -207,11 +209,15 @@ class TestPiInteractions(unittest.TestCase):
    # load and sanitize two real molecules
    _, self.prot = rgf.load_molecule(
        os.path.join(current_dir, '3ws9_protein_fixer_rdkit.pdb'),
        add_hydrogens=False, calc_charges=False, sanitize=True)
        add_hydrogens=False,
        calc_charges=False,
        sanitize=True)

    _, self.lig = rgf.load_molecule(
        os.path.join(current_dir, '3ws9_ligand.sdf'),
        add_hydrogens=False, calc_charges=False, sanitize=True)
        add_hydrogens=False,
        calc_charges=False,
        sanitize=True)

  def test_compute_ring_center(self):
    # FIXME might break with different version of rdkit
@@ -468,29 +474,116 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
    self.ligand_file = os.path.join(package_dir, 'dock', 'tests',
                                    '1jld_ligand.sdf')

  def test_featurizer(self):
  def test_default_featurizer(self):
    # test if default parameters work
    featurizer = rgf.RdkitGridFeaturizer()
    self.assertIsInstance(featurizer, rgf.RdkitGridFeaturizer)
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
                                                    [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)

  def test_example_featurizer(self):
    # check if use-case from examples works
    featurizer = rgf.RdkitGridFeaturizer(
        voxel_width=16.0,
        feature_types=['ecfp', 'splif', 'hbond', 'salt_bridge'],
        ecfp_power=9,
        splif_power=9,
        flatten=True)
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
                                                    [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)

  def test_force_flatten(self):
    # test if input is flattened when flat features are used
    featurizer = rgf.RdkitGridFeaturizer(
        feature_types=['ecfp_hashed'], flatten=False)
    featurizer.flatten = True  # False should be ignored with ecfp_hashed
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
                                                    [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)
    self.assertEqual(feature_tensor.shape, (1, 2 * 2**featurizer.ecfp_power))

  def test_combined(self):
    ecfp_power = 5
    splif_power = 5
    # test voxel features
    featurizer = rgf.RdkitGridFeaturizer(
        voxel_width=1.0,
        box_width=20.0,
        feature_types=['voxel_combined'],
        ecfp_power=ecfp_power,
        splif_power=splif_power,
        flatten=False,
        sanitize=True)
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
                                                    [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)
    voxel_total_len = (
        2**ecfp_power +
        len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power +
        len(featurizer.cutoffs['hbond_dist_bins']) + 5)
    self.assertEqual(feature_tensor.shape, (1, 20, 20, 20, voxel_total_len))

    # test flat features
    featurizer = rgf.RdkitGridFeaturizer(
        voxel_width=1.0,
        feature_types=['flat_combined'],
        ecfp_power=ecfp_power,
        splif_power=splif_power,
        sanitize=True)
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
                                                    [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)
    flat_total_len = (
        3 * 2**ecfp_power +
        len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power +
        len(featurizer.cutoffs['hbond_dist_bins']))
    self.assertEqual(feature_tensor.shape, (1, flat_total_len))

    # just check if it works for the use-case from examples
    # check if aromatic features are ignores if sanitize=False
    featurizer = rgf.RdkitGridFeaturizer(
        voxel_width=16.0,
        feature_types=[
            'ecfp', 'splif', 'hbond', 'salt_bridge', 'pi_stack', 'cation_pi'
        ],
        feature_types=['all_combined'],
        ecfp_power=ecfp_power,
        splif_power=splif_power,
        flatten=True,
        sanitize=True)
    self.assertIsInstance(featurizer, rgf.RdkitGridFeaturizer)
        sanitize=False)

    self.assertTrue('pi_stack' not in featurizer.feature_types)
    self.assertTrue('cation_pi' not in featurizer.feature_types)
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
                                                    [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)
    total_len = (2**ecfp_power +
                 len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power
                 + len(featurizer.cutoffs['hbond_dist_bins']) + 4)
    total_len = voxel_total_len + flat_total_len - 3 - 2**ecfp_power
    self.assertEqual(feature_tensor.shape, (1, total_len))

  def test_custom_cutoffs(self):
    custom_cutoffs = {
        'hbond_dist_bins': [(2., 3.), (3., 3.5)],
        'hbond_angle_cutoffs': [5, 90],
        'splif_contact_bins': [(0, 3.5), (3.5, 6.0)],
        'ecfp_cutoff': 5.0,
        'sybyl_cutoff': 3.0,
        'salt_bridges_cutoff': 4.0,
        'pi_stack_dist_cutoff': 5.0,
        'pi_stack_angle_cutoff': 15.0,
        'cation_pi_dist_cutoff': 5.5,
        'cation_pi_angle_cutoff': 20.0,
    }
    rgf_featurizer = rgf.RdkitGridFeaturizer(**custom_cutoffs)
    self.assertEqual(rgf_featurizer.cutoffs, custom_cutoffs)

  def test_rotations(self):
    featurizer = rgf.RdkitGridFeaturizer(
        nb_rotations=3,
        feature_types=['voxel_combined'],
        flatten=False,
        sanitize=True)
    feature_tensors = featurizer.featurize_complexes([self.ligand_file],
                                                     [self.protein_file])
    self.assertEqual(len(feature_tensors), 4)

  def test_voxelize(self):
    prot_xyz, prot_rdk = rgf.load_molecule(self.protein_file)
    lig_xyz, lig_rdk = rgf.load_molecule(self.ligand_file)