Unverified Commit b57a23c2 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1397 from rbharath/atomic_molnet

Molnet support for Atomic Conv Featurization
parents 587f5f09 60a06708
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -46,7 +46,7 @@ class GridPoseScorer(object):

  def score(self, protein_file, ligand_file):
    """Returns a score for a protein/ligand pair."""
    features = self.featurizer.featurize_complexes([ligand_file],
    features, _ = self.featurizer.featurize_complexes([ligand_file],
                                                      [protein_file])
    dataset = NumpyDataset(X=features, y=None, w=None, ids=None)
    score = self.model.predict(dataset)
+30 −12
Original line number Diff line number Diff line
@@ -8,10 +8,13 @@ __author__ = "Joseph Gomes and Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import logging
import numpy as np
from deepchem.utils.save import log
from deepchem.feat import Featurizer
from deepchem.feat import ComplexFeaturizer
from deepchem.utils import rdkit_util, pad_array
from deepchem.utils.rdkit_util import MoleculeLoadException


class AtomicCoordinates(Featurizer):
@@ -172,10 +175,10 @@ class NeighborListComplexAtomicCoordinates(ComplexFeaturizer):

    Parameters
    ----------
    mol_pdb: list
      Should be a list of lines of the PDB file.
    complex_pdb: list
      Should be a list of lines of the PDB file.
    mol_pdb_file: Str 
      Filename for ligand pdb file. 
    protein_pdb_file: Str 
      Filename for protein pdb file. 
    """
    mol_coords, ob_mol = rdkit_util.load_molecule(mol_pdb_file)
    protein_coords, protein_mol = rdkit_util.load_molecule(protein_pdb_file)
@@ -223,8 +226,14 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
        self.max_num_neighbors, self.neighbor_cutoff)

  def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
    try:
      frag1_coords, frag1_mol = rdkit_util.load_molecule(mol_pdb_file)
      frag2_coords, frag2_mol = rdkit_util.load_molecule(protein_pdb_file)
    except MoleculeLoadException:
      # Currently handles loading failures by returning None
      # TODO: Is there a better handling procedure?
      logging.warning("Some molecules cannot be loaded by Rdkit. Skipping")
      return None
    system_mol = rdkit_util.merge_molecules(frag1_mol, frag2_mol)
    system_coords = rdkit_util.get_xyz_from_mol(system_mol)

@@ -232,6 +241,7 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
    frag2_coords, frag2_mol = self._strip_hydrogens(frag2_coords, frag2_mol)
    system_coords, system_mol = self._strip_hydrogens(system_coords, system_mol)

    try:
      frag1_coords, frag1_neighbor_list, frag1_z = self.featurize_mol(
          frag1_coords, frag1_mol, self.frag1_num_atoms)

@@ -240,15 +250,23 @@ class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):

      system_coords, system_neighbor_list, system_z = self.featurize_mol(
          system_coords, system_mol, self.complex_num_atoms)
    except ValueError as e:
      logging.warning(
          "max_atoms was set too low. Some complexes too large and skipped")
      return None

    return frag1_coords, frag1_neighbor_list, frag1_z, frag2_coords, frag2_neighbor_list, frag2_z, \
           system_coords, system_neighbor_list, system_z

  def get_Z_matrix(self, mol, max_atoms):
    if len(mol.GetAtoms()) > max_atoms:
      raise ValueError("A molecule is larger than permitted by max_atoms. "
                       "Increase max_atoms and try again.")
    return pad_array(
        np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()]), max_atoms)

  def featurize_mol(self, coords, mol, max_num_atoms):
    logging.info("Featurizing molecule of size: %d", len(mol.GetAtoms()))
    neighbor_list = compute_neighbor_list(coords, self.neighbor_cutoff,
                                          self.max_num_neighbors, None)
    z = self.get_Z_matrix(mol, max_num_atoms)
+37 −13
Original line number Diff line number Diff line
"""
Feature calculations.
"""
import logging
import types
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdGeometry, rdMolTransforms
from deepchem.utils.save import log
import multiprocessing

__author__ = "Steven Kearnes"
__copyright__ = "Copyright 2014, Stanford University"
__license__ = "BSD 3-clause"


def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
  logging.info(log_message)
  return featurizer._featurize_complex(mol_pdb_file, protein_pdb_file)


class ComplexFeaturizer(object):
  """"
  Abstract class for calculating features for mol/protein complexes.
  """

  def featurize_complexes(self, mol_pdbs, protein_pdbs, verbose=True,
                          log_every_n=1000):
  def featurize_complexes(self, mol_files, protein_pdbs):
    """
    Calculate features for mol/protein complexes.

    Parameters
    ----------
    mol_pdbs: list
      List of PDBs for molecules. Each PDB should be a list of lines of the
      PDB file.
    mols: list
      List of PDB filenames for molecules.
    protein_pdbs: list
      List of PDBs for proteins. Each PDB should be a list of lines of the
      PDB file.
      List of PDB filenames for proteins.

    Returns
    -------
    features: np.array
      Array of features
    failures: list
      Indices of complexes that failed to featurize.
    """
    pool = multiprocessing.Pool()
    results = []
    for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
      log_message = "Featurizing %d / %d" % (i, len(mol_files))
      results.append(
          pool.apply_async(_featurize_complex,
                           (self, mol_file, protein_pdb, log_message)))
    pool.close()
    features = []
    for i, (mol_pdb, protein_pdb) in enumerate(zip(mol_pdbs, protein_pdbs)):
      if verbose and i % log_every_n == 0:
        log("Featurizing %d / %d" % (i, len(mol_pdbs)))
      features.append(self._featurize_complex(mol_pdb, protein_pdb))
    failures = []
    for ind, result in enumerate(results):
      new_features = result.get()
      # Handle loading failures which return None
      if new_features is not None:
        features.append(new_features)
      else:
        failures.append(ind)
    features = np.asarray(features)
    return features
    return features, failures

  def _featurize_complex(self, mol_pdb, complex_pdb):
    """
@@ -51,6 +74,7 @@ class ComplexFeaturizer(object):
    """
    raise NotImplementedError('Featurizer is not defined.')


class Featurizer(object):
  """
  Abstract class for calculating a set of features for a molecule.
+41 −112
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@ __author__ = "Bharath Ramsundar, Evan Feinberg, and Karl Leswing"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import logging
import os
import shutil
from warnings import warn
@@ -16,6 +17,7 @@ from collections import Counter
from rdkit import Chem
from rdkit.Chem import AllChem
from deepchem.utils.rdkit_util import load_molecule
from deepchem.utils.rdkit_util import MoleculeLoadException

import numpy as np
from scipy.spatial.distance import cdist
@@ -27,20 +29,6 @@ TODO(LESWING) add sanitization with rdkit upgrade to 2017.*
"""


def get_ligand_filetype(ligand_filename):
  """Returns the filetype of ligand."""
  if ".mol2" in ligand_filename:
    return "mol2"
  elif ".sdf" in ligand_filename:
    return "sdf"
  elif ".pdbqt" in ligand_filename:
    return "pdbqt"
  elif ".pdb" in ligand_filename:
    return "pdb"
  else:
    raise ValueError("Unrecognized_filename")


def compute_centroid(coordinates):
  """Compute the x,y,z centroid of provided coordinates

@@ -1226,75 +1214,10 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      ]
    raise ValueError('Unknown feature type "%s"' % feature_name)

  def _featurize_complex(self, ligand_ext, ligand_lines, protein_pdb_lines,
                         log_message):
    tempdir = tempfile.mkdtemp()
    if log_message is not None:
      log(log_message)

    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    ligand_file = os.path.join(tempdir, "ligand.%s" % ligand_ext)
    with open(ligand_file, "w") as mol_f:
      mol_f.writelines(ligand_lines)
    ############################################################## TIMING
    time2 = time.time()
    log("TIMING: Writing ligand took %0.3f s" % (time2 - time1), self.verbose)
    ############################################################## TIMING

    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    protein_pdb_file = os.path.join(tempdir, "protein.pdb")
    with open(protein_pdb_file, "w") as protein_f:
      protein_f.writelines(protein_pdb_lines)
    ############################################################## TIMING
    time2 = time.time()
    log("TIMING: Writing protein took %0.3f s" % (time2 - time1), self.verbose)
    ############################################################## TIMING

    features_dict = self._transform(protein_pdb_file, ligand_file)
    shutil.rmtree(tempdir)
    return list(features_dict.values())

  def featurize_complexes(self, mol_files, protein_pdbs, log_every_n=1000):
    """
    Calculate features for mol/protein complexes.

    Parameters
    ----------
    mols: list
      List of PDB filenames for molecules.
    protein_pdbs: list
      List of PDB filenames for proteins.
    """
    pool = multiprocessing.Pool()
    results = []
    for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
      log_message = "Featurizing %d / %d" % (
          i, len(mol_files)) if i % log_every_n == 0 else None
      ligand_ext = get_ligand_filetype(mol_file)
      with open(mol_file) as mol_f:
        mol_lines = mol_f.readlines()
      with open(protein_pdb) as protein_file:
        protein_pdb_lines = protein_file.readlines()
      results.append(
          pool.apply_async(
              _featurize_complex,
              (self, ligand_ext, mol_lines, protein_pdb_lines, log_message)))
    pool.close()
    features = []
    for result in results:
      features += result.get()
    features = np.asarray(features)
    return features

  def _transform(self, protein_pdb, ligand_file):
    """Computes featurization of protein/ligand complex.
  def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
    """Computes grid featurization of protein/ligand complex.

    Takes as input files (strings) for pdb of the protein, pdb of the ligand,
    and a directory to save intermediate files.
    Takes as input filenames pdb of the protein, pdb of the ligand.

    This function then computes the centroid of the ligand; decrements this
    centroid from the atomic coordinates of protein and ligand atoms, and then
@@ -1302,13 +1225,20 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
    saved.

    This function then computes a featurization with scheme specified by the user.
    Parameters
    ----------
    mol_pdb_file: Str 
      Filename for ligand pdb file. 
    protein_pdb_file: Str 
      Filename for protein pdb file. 
    """
    try:
      ############################################################## TIMING
      time1 = time.time()
      ############################################################## TIMING

      protein_xyz, protein_rdk = load_molecule(
        protein_pdb, calc_charges=True, sanitize=self.sanitize)
          protein_pdb_file, calc_charges=True, sanitize=self.sanitize)
      ############################################################## TIMING
      time2 = time.time()
      log("TIMING: Loading protein coordinates took %0.3f s" % (time2 - time1),
@@ -1318,12 +1248,15 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      time1 = time.time()
      ############################################################## TIMING
      ligand_xyz, ligand_rdk = load_molecule(
        ligand_file, calc_charges=True, sanitize=self.sanitize)
          mol_pdb_file, calc_charges=True, sanitize=self.sanitize)
      ############################################################## TIMING
      time2 = time.time()
      log("TIMING: Loading ligand coordinates took %0.3f s" % (time2 - time1),
          self.verbose)
      ############################################################## TIMING
    except MoleculeLoadException:
      logging.warning("Some molecules cannot be loaded by Rdkit. Skipping")
      return None

    ############################################################## TIMING
    time1 = time.time()
@@ -1346,7 +1279,7 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      rotated_system = rotate_molecules([protein_xyz, ligand_xyz])
      transformed_systems[(i + 1, 0)] = rotated_system

    features = {}
    features_dict = {}
    for system_id, (protein_xyz, ligand_xyz) in transformed_systems.items():
      feature_arrays = []
      for is_flat, function_name in self.feature_types:
@@ -1362,11 +1295,13 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
        feature_arrays += result

        if self.flatten:
          features[system_id] = np.concatenate(
          features_dict[system_id] = np.concatenate(
              [feature_array.flatten() for feature_array in feature_arrays])
        else:
          features[system_id] = np.concatenate(feature_arrays, axis=-1)
          features_dict[system_id] = np.concatenate(feature_arrays, axis=-1)

    # TODO(rbharath): Is this squeeze OK?
    features = np.squeeze(np.array(list(features_dict.values())))
    return features

  def _voxelize(self,
@@ -1464,9 +1399,3 @@ class RdkitGridFeaturizer(ComplexFeaturizer):
      feature_vector[0] += len(feature_list)

    return feature_vector


def _featurize_complex(featurizer, ligand_ext, ligand_lines, protein_pdb_lines,
                       log_message):
  return featurizer._featurize_complex(ligand_ext, ligand_lines,
                                       protein_pdb_lines, log_message)
+29 −38
Original line number Diff line number Diff line
@@ -33,20 +33,6 @@ class TestHelperFunctions(unittest.TestCase):
                                     '3ws9_protein_fixer_rdkit.pdb')
    self.ligand_file = os.path.join(current_dir, '3ws9_ligand.sdf')

  def test_get_ligand_filetype(self):

    supported_extensions = ['mol2', 'sdf', 'pdb', 'pdbqt']
    # some users might try to read smiles with this function
    unsupported_extensions = ['smi', 'ism']

    for extension in supported_extensions:
      fname = 'molecule.%s' % extension
      self.assertEqual(rgf.get_ligand_filetype(fname), extension)

    for extension in unsupported_extensions:
      fname = 'molecule.%s' % extension
      self.assertRaises(ValueError, rgf.get_ligand_filetype, fname)

  def test_load_molecule(self):
    # adding hydrogens and charges is tested in dc.utils
    for add_hydrogens in (True, False):
@@ -367,7 +353,8 @@ class TestFeaturizationFunctions(unittest.TestCase):
        prot_xyz,
        prot_rdk,
        lig_xyz,
        lig_rdk,)
        lig_rdk,
    )
    prot_dict_dist, lig_dict_dist = rgf.featurize_binding_pocket_ecfp(
        prot_xyz, prot_rdk, lig_xyz, lig_rdk, pairwise_distances=distance)
    # ...but first check if we actually got two dicts
@@ -383,13 +370,15 @@ class TestFeaturizationFunctions(unittest.TestCase):
        prot_rdk,
        lig_xyz,
        lig_rdk,
        cutoff=2.0,)
        cutoff=2.0,
    )
    prot_dict_d6, lig_dict_d6 = rgf.featurize_binding_pocket_ecfp(
        prot_xyz,
        prot_rdk,
        lig_xyz,
        lig_rdk,
        cutoff=6.0,)
        cutoff=6.0,
    )
    self.assertLess(len(prot_dict_d2), len(prot_dict))
    # ligands are typically small so all atoms might be present
    self.assertLessEqual(len(lig_dict_d2), len(lig_dict))
@@ -402,7 +391,8 @@ class TestFeaturizationFunctions(unittest.TestCase):
        prot_rdk,
        lig_xyz,
        lig_rdk,
        ecfp_degree=3,)
        ecfp_degree=3,
    )
    self.assertNotEqual(prot_dict_e3, prot_dict)
    self.assertNotEqual(lig_dict_e3, lig_dict)

@@ -419,7 +409,8 @@ class TestFeaturizationFunctions(unittest.TestCase):
          prot_rdk,
          lig_rdk,
          distance,
          bins,)
          bins,
      )

      self.assertIsInstance(splif_dict, dict)
      for (prot_idx, lig_idx), ecfp_pair in splif_dict.items():
@@ -478,7 +469,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
    # test if default parameters work
    featurizer = rgf.RdkitGridFeaturizer()
    self.assertIsInstance(featurizer, rgf.RdkitGridFeaturizer)
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
    feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
                                                       [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)

@@ -490,7 +481,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
        ecfp_power=9,
        splif_power=9,
        flatten=True)
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
    feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
                                                       [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)

@@ -499,7 +490,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
    featurizer = rgf.RdkitGridFeaturizer(
        feature_types=['ecfp_hashed'], flatten=False)
    featurizer.flatten = True  # False should be ignored with ecfp_hashed
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
    feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
                                                       [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)
    self.assertEqual(feature_tensor.shape, (1, 2 * 2**featurizer.ecfp_power))
@@ -516,13 +507,13 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
        splif_power=splif_power,
        flatten=False,
        sanitize=True)
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
    feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
                                                       [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)
    voxel_total_len = (
        2**ecfp_power +
        len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power +
        len(featurizer.cutoffs['hbond_dist_bins']) + 5)
        len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power + len(
            featurizer.cutoffs['hbond_dist_bins']) + 5)
    self.assertEqual(feature_tensor.shape, (1, 20, 20, 20, voxel_total_len))

    # test flat features
@@ -532,13 +523,13 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
        ecfp_power=ecfp_power,
        splif_power=splif_power,
        sanitize=True)
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
    feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
                                                       [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)
    flat_total_len = (
        3 * 2**ecfp_power +
        len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power +
        len(featurizer.cutoffs['hbond_dist_bins']))
        len(featurizer.cutoffs['splif_contact_bins']) * 2**splif_power + len(
            featurizer.cutoffs['hbond_dist_bins']))
    self.assertEqual(feature_tensor.shape, (1, flat_total_len))

    # check if aromatic features are ignores if sanitize=False
@@ -552,7 +543,7 @@ class TestRdkitGridFeaturizer(unittest.TestCase):

    self.assertTrue('pi_stack' not in featurizer.feature_types)
    self.assertTrue('cation_pi' not in featurizer.feature_types)
    feature_tensor = featurizer.featurize_complexes([self.ligand_file],
    feature_tensor, _ = featurizer.featurize_complexes([self.ligand_file],
                                                       [self.protein_file])
    self.assertIsInstance(feature_tensor, np.ndarray)
    total_len = voxel_total_len + flat_total_len - 3 - 2**ecfp_power
@@ -580,9 +571,9 @@ class TestRdkitGridFeaturizer(unittest.TestCase):
        feature_types=['voxel_combined'],
        flatten=False,
        sanitize=True)
    feature_tensors = featurizer.featurize_complexes([self.ligand_file],
    feature_tensors, _ = featurizer.featurize_complexes([self.ligand_file],
                                                        [self.protein_file])
    self.assertEqual(len(feature_tensors), 4)
    self.assertEqual(feature_tensors.shape, (1, 4, 16, 16, 16, 40))

  def test_voxelize(self):
    prot_xyz, prot_rdk = rgf.load_molecule(self.protein_file)
Loading