Commit 62dcf20f authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Changes

parent f5595b60
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -175,10 +175,10 @@ class NeighborListComplexAtomicCoordinates(ComplexFeaturizer):

    Parameters
    ----------
    mol_pdb: list
      Should be a list of lines of the PDB file.
    complex_pdb: list
      Should be a list of lines of the PDB file.
    mol_pdb_file: Str 
      Filename for ligand pdb file. 
    protein_pdb_file: Str 
      Filename for protein pdb file. 
    """
    mol_coords, ob_mol = rdkit_util.load_molecule(mol_pdb_file)
    protein_coords, protein_mol = rdkit_util.load_molecule(protein_pdb_file)
+7 −5
Original line number Diff line number Diff line
"""
Feature calculations.
"""
import logging
import types
import numpy as np
from rdkit import Chem
@@ -12,7 +13,8 @@ __copyright__ = "Copyright 2014, Stanford University"
__license__ = "BSD 3-clause"


def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file):
def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
  logging.info(log_message)
  return featurizer._featurize_complex(mol_pdb_file, protein_pdb_file)


@@ -21,7 +23,7 @@ class ComplexFeaturizer(object):
  Abstract class for calculating features for mol/protein complexes.
  """

  def featurize_complexes(self, mol_files, protein_pdbs, log_every_n=1000):
  def featurize_complexes(self, mol_files, protein_pdbs):
    """
    Calculate features for mol/protein complexes.

@@ -42,10 +44,10 @@ class ComplexFeaturizer(object):
    pool = multiprocessing.Pool()
    results = []
    for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
      log_message = "Featurizing %d / %d" % (
          i, len(mol_files)) if i % log_every_n == 0 else None
      log_message = "Featurizing %d / %d" % (i, len(mol_files))
      results.append(
          pool.apply_async(_featurize_complex, (self, mol_file, protein_pdb)))
          pool.apply_async(_featurize_complex,
                           (self, mol_file, protein_pdb, log_message)))
    pool.close()
    features = []
    failures = []
+16 −103
Original line number Diff line number Diff line
@@ -29,20 +29,6 @@ TODO(LESWING) add sanitization with rdkit upgrade to 2017.*
"""


def get_ligand_filetype(ligand_filename):
  """Returns the filetype of ligand."""
  if ".mol2" in ligand_filename:
    return "mol2"
  elif ".sdf" in ligand_filename:
    return "sdf"
  elif ".pdbqt" in ligand_filename:
    return "pdbqt"
  elif ".pdb" in ligand_filename:
    return "pdb"
  else:
    raise ValueError("Unrecognized_filename")


def compute_centroid(coordinates):
  """Compute the x,y,z centroid of provided coordinates

@@ -1228,85 +1214,10 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      ]
    raise ValueError('Unknown feature type "%s"' % feature_name)

  def _featurize_complex(self, ligand_ext, ligand_lines, protein_pdb_lines,
                         log_message):
    tempdir = tempfile.mkdtemp()
    if log_message is not None:
      log(log_message)
  def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
    """Computes grid featurization of protein/ligand complex.

    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    ligand_file = os.path.join(tempdir, "ligand.%s" % ligand_ext)
    with open(ligand_file, "w") as mol_f:
      mol_f.writelines(ligand_lines)
    ############################################################## TIMING
    time2 = time.time()
    log("TIMING: Writing ligand took %0.3f s" % (time2 - time1), self.verbose)
    ############################################################## TIMING

    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    protein_pdb_file = os.path.join(tempdir, "protein.pdb")
    with open(protein_pdb_file, "w") as protein_f:
      protein_f.writelines(protein_pdb_lines)
    ############################################################## TIMING
    time2 = time.time()
    log("TIMING: Writing protein took %0.3f s" % (time2 - time1), self.verbose)
    ############################################################## TIMING

    features_dict = self._transform(protein_pdb_file, ligand_file)
    shutil.rmtree(tempdir)
    # Handle case the transform failed
    if features_dict is not None:
      return list(features_dict.values())
    else:
      return None

  def featurize_complexes(self, mol_files, protein_pdbs, log_every_n=1000):
    """
    Calculate features for mol/protein complexes.

    Parameters
    ----------
    mols: list
      List of PDB filenames for molecules.
    protein_pdbs: list
      List of PDB filenames for proteins.
    """
    pool = multiprocessing.Pool()
    results = []
    for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
      log_message = "Featurizing %d / %d" % (
          i, len(mol_files)) if i % log_every_n == 0 else None
      ligand_ext = get_ligand_filetype(mol_file)
      with open(mol_file) as mol_f:
        mol_lines = mol_f.readlines()
      with open(protein_pdb) as protein_file:
        protein_pdb_lines = protein_file.readlines()
      results.append(
          pool.apply_async(
              _featurize_complex,
              (self, ligand_ext, mol_lines, protein_pdb_lines, log_message)))
    pool.close()
    features = []
    failures = []
    for ind, result in enumerate(results):
      new_features = result.get()
      # Handle loading failures which return None
      if new_features is not None:
        features.append(new_features)
      else:
        failures.append(ind)
    features = np.asarray(features)
    return features, failures

  def _transform(self, protein_pdb, ligand_file):
    """Computes featurization of protein/ligand complex.

    Takes as input files (strings) for pdb of the protein, pdb of the ligand,
    and a directory to save intermediate files.
    Takes as input filenames pdb of the protein, pdb of the ligand.

    This function then computes the centroid of the ligand; decrements this
    centroid from the atomic coordinates of protein and ligand atoms, and then
@@ -1314,6 +1225,12 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
    saved.

    This function then computes a featurization with scheme specified by the user.
    Parameters
    ----------
    mol_pdb_file: Str 
      Filename for ligand pdb file. 
    protein_pdb_file: Str 
      Filename for protein pdb file. 
    """
    try:
      ############################################################## TIMING
@@ -1321,7 +1238,7 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      ############################################################## TIMING

      protein_xyz, protein_rdk = load_molecule(
          protein_pdb, calc_charges=True, sanitize=self.sanitize)
          protein_pdb_file, calc_charges=True, sanitize=self.sanitize)
      ############################################################## TIMING
      time2 = time.time()
      log("TIMING: Loading protein coordinates took %0.3f s" % (time2 - time1),
@@ -1331,7 +1248,7 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      time1 = time.time()
      ############################################################## TIMING
      ligand_xyz, ligand_rdk = load_molecule(
          ligand_file, calc_charges=True, sanitize=self.sanitize)
          mol_pdb_file, calc_charges=True, sanitize=self.sanitize)
      ############################################################## TIMING
      time2 = time.time()
      log("TIMING: Loading ligand coordinates took %0.3f s" % (time2 - time1),
@@ -1362,7 +1279,7 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      rotated_system = rotate_molecules([protein_xyz, ligand_xyz])
      transformed_systems[(i + 1, 0)] = rotated_system

    features = {}
    features_dict = {}
    for system_id, (protein_xyz, ligand_xyz) in transformed_systems.items():
      feature_arrays = []
      for is_flat, function_name in self.feature_types:
@@ -1378,11 +1295,13 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
        feature_arrays += result

        if self.flatten:
          features[system_id] = np.concatenate(
          features_dict[system_id] = np.concatenate(
              [feature_array.flatten() for feature_array in feature_arrays])
        else:
          features[system_id] = np.concatenate(feature_arrays, axis=-1)
          features_dict[system_id] = np.concatenate(feature_arrays, axis=-1)

    # TODO(rbharath): Is this squeeze OK?
    features = np.squeeze(np.array(list(features_dict.values())))
    return features

  def _voxelize(self,
@@ -1480,9 +1399,3 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      feature_vector[0] += len(feature_list)

    return feature_vector


def _featurize_complex(featurizer, ligand_ext, ligand_lines, protein_pdb_lines,
                       log_message):
  return featurizer._featurize_complex(ligand_ext, ligand_lines,
                                       protein_pdb_lines, log_message)
+39 −34
Original line number Diff line number Diff line
@@ -33,19 +33,19 @@ class TestHelperFunctions(unittest.TestCase):
                                     '3ws9_protein_fixer_rdkit.pdb')
    self.ligand_file = os.path.join(current_dir, '3ws9_ligand.sdf')

  def test_get_ligand_filetype(self):
  #def test_get_ligand_filetype(self):

    supported_extensions = ['mol2', 'sdf', 'pdb', 'pdbqt']
    # some users might try to read smiles with this function
    unsupported_extensions = ['smi', 'ism']
  #  supported_extensions = ['mol2', 'sdf', 'pdb', 'pdbqt']
  #  # some users might try to read smiles with this function
  #  unsupported_extensions = ['smi', 'ism']

    for extension in supported_extensions:
      fname = 'molecule.%s' % extension
      self.assertEqual(rgf.get_ligand_filetype(fname), extension)
  #  for extension in supported_extensions:
  #    fname = 'molecule.%s' % extension
  #    self.assertEqual(rgf.get_ligand_filetype(fname), extension)

    for extension in unsupported_extensions:
      fname = 'molecule.%s' % extension
      self.assertRaises(ValueError, rgf.get_ligand_filetype, fname)
  #  for extension in unsupported_extensions:
  #    fname = 'molecule.%s' % extension
  #    self.assertRaises(ValueError, rgf.get_ligand_filetype, fname)

  def test_load_molecule(self):
    # adding hydrogens and charges is tested in dc.utils
@@ -367,7 +367,8 @@ class TestFeaturizationFunctions(unittest.TestCase):
        prot_xyz,
        prot_rdk,
        lig_xyz,
        lig_rdk,)
        lig_rdk,
    )
    prot_dict_dist, lig_dict_dist = rgf.featurize_binding_pocket_ecfp(
        prot_xyz, prot_rdk, lig_xyz, lig_rdk, pairwise_distances=distance)
    # ...but first check if we actually got two dicts
@@ -383,13 +384,15 @@ class TestFeaturizationFunctions(unittest.TestCase):
        prot_rdk,
        lig_xyz,
        lig_rdk,
        cutoff=2.0,)
        cutoff=2.0,
    )
    prot_dict_d6, lig_dict_d6 = rgf.featurize_binding_pocket_ecfp(
        prot_xyz,
        prot_rdk,
        lig_xyz,
        lig_rdk,
        cutoff=6.0,)
        cutoff=6.0,
    )
    self.assertLess(len(prot_dict_d2), len(prot_dict))
    # ligands are typically small so all atoms might be present
    self.assertLessEqual(len(lig_dict_d2), len(lig_dict))
@@ -402,7 +405,8 @@ class TestFeaturizationFunctions(unittest.TestCase):
        prot_rdk,
        lig_xyz,
        lig_rdk,
        ecfp_degree=3,)
        ecfp_degree=3,
    )
    self.assertNotEqual(prot_dict_e3, prot_dict)
    self.assertNotEqual(lig_dict_e3, lig_dict)

@@ -419,7 +423,8 @@ class TestFeaturizationFunctions(unittest.TestCase):
          prot_rdk,
          lig_rdk,
          distance,
          bins,)
          bins,
      )

      self.assertIsInstance(splif_dict, dict)
      for (prot_idx, lig_idx), ecfp_pair in splif_dict.items():
@@ -478,7 +483,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
    # test if default parameters work
    featurizer = rgf.RdkitGridFeaturizer()
    self.assertIsInstance(featurizer, rgf.RdkitGridFeaturizer)
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
    feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
                                                       [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)

@@ -490,7 +495,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
        ecfp_power=9,
        splif_power=9,
        flatten=True)
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
    feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
                                                       [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)

@@ -499,7 +504,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
    featurizer = rgf.RdkitGridFeaturizer(
        feature_types=['ecfp_hashed'], flatten=False)
    featurizer.flatten = True  # False should be ignored with ecfp_hashed
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
    feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
                                                       [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)
    self.assertEqual(feature_tensor.shape, (1, 2 * 2**featurizer.ecfp_power))
@@ -516,13 +521,13 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
        splif_power=splif_power,
        flatten=False,
        sanitize=True)
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
    feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
                                                       [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)
    voxel_total_len = (
        2**ecfp_power +
        len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power +
        len(featurizer.cutoffs['hbond_dist_bins']) + 5)
        len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power + len(
            featurizer.cutoffs['hbond_dist_bins']) + 5)
    self.assertEqual(feature_tensor.shape, (1, 20, 20, 20, voxel_total_len))

    # test flat features
@@ -532,13 +537,13 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
        ecfp_power=ecfp_power,
        splif_power=splif_power,
        sanitize=True)
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
    feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
                                                       [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)
    flat_total_len = (
        3 * 2**ecfp_power +
        len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power +
        len(featurizer.cutoffs['hbond_dist_bins']))
        len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power + len(
            featurizer.cutoffs['hbond_dist_bins']))
    self.assertEqual(feature_tensor.shape, (1, flat_total_len))

    # check if aromatic features are ignores if sanitize=False
@@ -552,7 +557,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):

    self.assertTrue('pi_stack' not in featurizer.feature_types)
    self.assertTrue('cation_pi' not in featurizer.feature_types)
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
    feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
                                                       [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)
    total_len = voxel_total_len + flat_total_len - 3 - 2**ecfp_power
@@ -580,9 +585,9 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
        feature_types=['voxel_combined'],
        flatten=False,
        sanitize=True)
    feature_tensors = featurizer.featurize_complexes([self.ligand_file],
    feature_tensors, _ = featurizer.featurize_complexes([self.ligand_file],
                                                        [self.protein_file])
    self.assertEqual(len(feature_tensors), 4)
    self.assertEqual(feature_tensors.shape, (1, 4, 16, 16, 16, 40))

  def test_voxelize(self):
    prot_xyz, prot_rdk = rgf.load_molecule(self.protein_file)
+5 −2
Original line number Diff line number Diff line
@@ -234,13 +234,16 @@ def load_pdbbind(featurizer="grid", split="random", subset="core", reload=True):
  else:
    raise ValueError("Featurizer not supported")
  print("Featurizing Complexes")
  features, failures = featurizer.featurize_complexes(
      ligand_files, protein_files, log_every_n=1)
  features, failures = featurizer.featurize_complexes(ligand_files,
                                                      protein_files)
  # Delete labels for failing elements
  labels = np.delete(labels, failures)
  dataset = deepchem.data.DiskDataset.from_numpy(features, labels)
  # No transformations of data
  transformers = []
  if split == None:
    return tasks, (dataset, None, None), transformers

  # TODO(rbharath): This should be modified to contain a cluster split so
  # structures of the same protein aren't in both train/test
  splitters = {