Commit 354bdb65 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #231 from rbharath/complex_support

Extending Neighbor List to Complexes
parents 6151d9df 5a537763
Loading
Loading
Loading
Loading
+61 −4
Original line number Diff line number Diff line
@@ -17,6 +17,7 @@ from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer
from deepchem.featurizers.nnscore import NNScoreComplexFeaturizer
from deepchem.featurizers.grid_featurizer import GridFeaturizer
from deepchem.featurizers.atomic_coordinates import NeighborListComplexAtomicCoordinates

def load_pdbbind_labels(labels_file):
  """Loads pdbbind labels as dataframe"""
@@ -44,9 +45,7 @@ def compute_pdbbind_grid_feature(compound_featurizers, complex_featurizers,
  for complex_featurizer in complex_featurizers:
    features = complex_featurizer.featurize_complexes(
      [ligand_file], [protein_file])
    ################################################ DEBUG
    all_features.append(np.squeeze(features))
    ################################################ DEBUG
  
  for compound_featurizer in compound_featurizers:
    features = np.squeeze(compound_featurizer.featurize([rdkit_mol]))
@@ -55,6 +54,66 @@ def compute_pdbbind_grid_feature(compound_featurizers, complex_featurizers,
  features = np.concatenate(all_features)
  return features

def compute_pdbbind_coordinate_features(
    complex_featurizer, pdb_subdir, pdb_code):
  """Compute features for a given complex"""
  protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
  ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)

  feature = complex_featurizer.featurize_complexes(
    [ligand_file], [protein_file])
  return feature

def load_core_pdbbind_coordinates(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False
  neighbor_cutoff = 4
  max_num_neighbors = 10

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  featurizer = NeighborListComplexAtomicCoordinates(
      max_num_neighbors, neighbor_cutoff)
  
  # Featurize Dataset
  features = []
  for ind, pdb_code in enumerate(ids):
    print("Processing %s" % str(pdb_code))
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_coordinate_features(
        featurizer, pdb_subdir, pdb_code)
    features.append(computed_feature)
  X = np.array(features, dtype-object)
  w = np.ones_like(y)
   
  dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []
  
  return tasks, dataset, transformers

def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split"""
  # Set some global variables up top
@@ -113,9 +172,7 @@ def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
      continue
    y_inds.append(ind)
    features.append(computed_feature)
  ############################################################# DEBUG
  y = y[y_inds]
  ############################################################# DEBUG
  X = np.vstack(features)
  w = np.ones_like(y)
   
+0 −13
Original line number Diff line number Diff line
@@ -8,24 +8,11 @@ from __future__ import unicode_literals
import os
import numpy as np
import shutil
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from deepchem.utils.save import load_from_disk
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.splits import ScaffoldSplitter
from deepchem.splits import RandomSplitter
from deepchem.datasets import Dataset
from deepchem.transformers import BalancingTransformer
from deepchem.hyperparameters import HyperparamOpt
from deepchem.models.multitask import SingletaskToMultitask
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.metrics import to_one_hot
from deepchem.models.sklearn_models import SklearnModel
from deepchem.utils.evaluate import relative_difference
from deepchem.utils.evaluate import Evaluator

def load_tox21(base_dir, reload=True):
  """Load Tox21 datasets. Does not do train/test split"""
+92 −43
Original line number Diff line number Diff line
@@ -11,6 +11,9 @@ __license__ = "LGPL v2.1+"

import numpy as np
from deepchem.featurizers import Featurizer
from deepchem.featurizers import ComplexFeaturizer
from deepchem.featurizers.grid_featurizer import load_molecule 
from deepchem.featurizers.grid_featurizer import merge_molecules

class AtomicCoordinates(Featurizer):
  """
@@ -45,6 +48,52 @@ class AtomicCoordinates(Featurizer):
    coords = [coords]
    return coords

def compute_neighbor_list(coords, neighbor_cutoff, max_num_neighbors):
  """Computes a neighbor list from atom coordinates."""
  N = coords.shape[0]
  x_bins, y_bins, z_bins = get_cells(coords, neighbor_cutoff)

  # Associate each atom with cell it belongs to. O(N)
  cell_to_atoms, atom_to_cell = put_atoms_in_cells(
      coords, x_bins, y_bins, z_bins)

  # Associate each cell with its neighbor cells. Assumes periodic boundary
  # conditions, so does wrapround. O(constant)
  N_x, N_y, N_z = len(x_bins), len(y_bins), len(z_bins)
  neighbor_cell_map = compute_neighbor_cell_map(N_x, N_y, N_z)

  # For each atom, loop through all atoms in its cell and neighboring cells.
  # Accept as neighbors only those within threshold. This computation should be
  # O(Nm), where m is the number of atoms within a set of neighboring-cells.
  neighbor_list = {}
  for atom in range(N):
    cell = atom_to_cell[atom]
    neighbor_cells = neighbor_cell_map[cell]
    # For smaller systems especially, the periodic boundary conditions can
    # result in neighboring cells being seen multiple times. Use a set() to
    # make sure duplicate neighbors are ignored. Convert back to list before
    # returning. 
    neighbor_list[atom] = set()
    for neighbor_cell in neighbor_cells:
      atoms_in_cell = cell_to_atoms[neighbor_cell]
      for neighbor_atom in atoms_in_cell:
        if neighbor_atom == atom:
          continue
        # TODO(rbharath): How does distance need to be modified here to
        # account for periodic boundary conditions?
        dist = np.linalg.norm(coords[atom] - coords[neighbor_atom])
        if dist < neighbor_cutoff:
          neighbor_list[atom].add((neighbor_atom, dist))
        
    # Sort neighbors by distance
    closest_neighbors = sorted(
        list(neighbor_list[atom]), key=lambda elt: elt[1])
    closest_neighbors = [nbr for (nbr, dist) in closest_neighbors]
    # Pick up to max_num_neighbors
    closest_neighbors = closest_neighbors[:max_num_neighbors]
    neighbor_list[atom] = closest_neighbors
  return neighbor_list

def get_cells(coords, neighbor_cutoff):
  """Computes cells given molecular coordinates."""
  x_max, x_min = np.amax(coords[:, 0]), np.amin(coords[:, 0])
@@ -187,53 +236,53 @@ class NeighborListAtomicCoordinates(Featurizer):

    Parameters
    ----------
      mol: rdkit Mol
        To be featurized.
    """
    N = mol.GetNumAtoms()
    # TODO(rbharath): Should this return a list?
    bohr_coords = self.coordinates_featurizer._featurize(mol)[0]
    coords = get_coords(mol)
    neighbor_list = compute_neighbor_list(
        coords, self.neighbor_cutoff, self.max_num_neighbors)
        
    x_bins, y_bins, z_bins = get_cells(coords, self.neighbor_cutoff)
    return (bohr_coords, neighbor_list)

    # Associate each atom with cell it belongs to. O(N)
    cell_to_atoms, atom_to_cell = put_atoms_in_cells(
        coords, x_bins, y_bins, z_bins)
class NeighborListComplexAtomicCoordinates(ComplexFeaturizer):
  """
  Adjacency list of neighbors for protein-ligand complexes in 3-space.

    # Associate each cell with its neighbor cells. Assumes periodic boundary
    # conditions, so does wrapround. O(constant)
    N_x, N_y, N_z = len(x_bins), len(y_bins), len(z_bins)
    neighbor_cell_map = compute_neighbor_cell_map(N_x, N_y, N_z)
  Neighbors dtermined by user-dfined distance cutoff.
  """
  def __init__(self, max_num_neighbors=None, neighbor_cutoff=4):
    if neighbor_cutoff <= 0:
      raise ValueError("neighbor_cutoff must be positive value.")
    if max_num_neighbors is not None:
      if not isinstance(max_num_neighbors, int) or max_num_neighbors <= 0:
        raise ValueError("max_num_neighbors must be positive integer.")
    self.max_num_neighbors = max_num_neighbors
    self.neighbor_cutoff = neighbor_cutoff
    # Type of data created by this featurizer
    self.dtype = object
    self.coordinates_featurizer = AtomicCoordinates()

    # For each atom, loop through all atoms in its cell and neighboring cells.
    # Accept as neighbors only those within threshold. This computation should be
    # O(Nm), where m is the number of atoms within a set of neighboring-cells.
    neighbor_list = {}
    for atom in range(N):
      cell = atom_to_cell[atom]
      neighbor_cells = neighbor_cell_map[cell]
      # For smaller systems especially, the periodic boundary conditions can
      # result in neighboring cells being seen multiple times. Use a set() to
      # make sure duplicate neighbors are ignored. Convert back to list before
      # returning. 
      neighbor_list[atom] = set()
      for neighbor_cell in neighbor_cells:
        atoms_in_cell = cell_to_atoms[neighbor_cell]
        for neighbor_atom in atoms_in_cell:
          if neighbor_atom == atom:
            continue
          # TODO(rbharath): How does distance need to be modified here to
          # account for periodic boundary conditions?
          dist = np.linalg.norm(coords[atom] - coords[neighbor_atom])
          if dist < self.neighbor_cutoff:
            neighbor_list[atom].add((neighbor_atom, dist))
  def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
    """
    Compute neighbor list for complex.

      # Sort neighbors by distance
      closest_neighbors = sorted(
          list(neighbor_list[atom]), key=lambda elt: elt[1])
      closest_neighbors = [nbr for (nbr, dist) in closest_neighbors]
      # Pick up to max_num_neighbors
      closest_neighbors = closest_neighbors[:self.max_num_neighbors]
      neighbor_list[atom] = closest_neighbors
    Parameters
    ----------
    mol_pdb: list
      Should be a list of lines of the PDB file.
    complex_pdb: list
      Should be a list of lines of the PDB file.
    """
    mol_coords, ob_mol = load_molecule(mol_pdb_file)
    protein_coords, protein_mol = load_molecule(protein_pdb_file)
    system_coords, system_mol = merge_molecules(
        mol_coords, ob_mol, protein_coords, protein_mol)
    
    system_neighbor_list = compute_neighbor_list(
        system_coords, self.neighbor_cutoff, self.max_num_neighbors)

    return (bohr_coords, neighbor_list)
    return (system_coords, system_neighbor_list)
+70 −60
Original line number Diff line number Diff line
@@ -28,6 +28,74 @@ import time
"""
http://stackoverflow.com/questions/38987/how-can-i-merge-two-python-dictionaries-in-a-single-expression
"""
def get_xyz_from_ob(ob_mol):
  """
  returns an m x 3 np array of 3d coords
  of given openbabel molecule
  """

  xyz = np.zeros((ob_mol.NumAtoms(), 3))
  for i, atom in enumerate(ob.OBMolAtomIter(ob_mol)):
    xyz[i, 0] = atom.x()
    xyz[i, 1] = atom.y()
    xyz[i, 2] = atom.z()
  return(xyz)


def load_molecule(molecule_file, remove_hydrogens=True,
                  calc_charges=False):
  """Converts molecule file to (xyz-coords, obmol object)

  Given molecule_file, returns a tuple of xyz coords of molecule
  and an openbabel object representing that molecule
  """

  if ".mol2" in molecule_file:
    obConversion = ob.OBConversion()
    obConversion.SetInAndOutFormats(str("mol2"), str("mol2"))
    ob_mol = ob.OBMol()
    obConversion.ReadFile(ob_mol, str(molecule_file))
  elif ".sdf" in molecule_file:
    obConversion = ob.OBConversion()
    obConversion.SetInAndOutFormats(str("sdf"), str("sdf"))
    ob_mol = ob.OBMol()
    obConversion.ReadFile(ob_mol, str(molecule_file))
  elif ".pdb" in molecule_file:
    obConversion = ob.OBConversion()
    obConversion.SetInAndOutFormats(str("pdb"), str("pdb"))
    ob_mol = ob.OBMol()
    obConversion.ReadFile(ob_mol, str(molecule_file))
  else:
    raise ValueError("Unrecognized file type")

  if calc_charges:
    gasteiger = ob.OBChargeModel.FindType(str("gasteiger"))
    gasteiger.ComputeCharges(ob_mol)
    ob_mol.UnsetImplicitValencePerceived()
    ob_mol.CorrectForPH(7.4)
    ob_mol.AddHydrogens()
  else:
    ob_mol.AddHydrogens()

  xyz = get_xyz_from_ob(ob_mol)

  return xyz, ob_mol

def merge_molecules(protein_xyz, protein, ligand_xyz, ligand):
  """Merges coordinates of protein and ligand.

  Takes as input protein and ligand objects of class PDB and adds ligand atoms
  to the protein, and returns the new instance of class PDB called system that
  contains both sets of atoms.
  """

  system_xyz = np.array(np.vstack(np.vstack((protein_xyz, ligand_xyz))))
  system_ob = ob.OBMol(protein)
  system_ob += ligand

  return system_xyz, system_ob


def _merge_two_dicts(x, y):
  """Given two dicts, merge them into a new dict as a shallow copy."""
  z = x.copy()
@@ -749,20 +817,6 @@ def _convert_atom_pair_to_voxel(
                        atom_index_pair[1], box_width, voxel_width)[0])
  return(indices_list)

def _merge_molecules(protein_xyz, protein, ligand_xyz, ligand):
  """Merges coordinates of protein and ligand.

  Takes as input protein and ligand objects of class PDB and adds ligand atoms
  to the protein, and returns the new instance of class PDB called system that
  contains both sets of atoms.
  """

  system_xyz = np.array(np.vstack(np.vstack((protein_xyz, ligand_xyz))))
  system_ob = ob.OBMol(protein_ob)
  system_ob += ligand_ob

  return system_xyz, system_ob

def _compute_charge_dictionary(molecule):
  """Computes partial charges for each atom."""
  charge_dictionary = {}
@@ -900,7 +954,7 @@ class GridFeaturizer(ComplexFeaturizer):
      "/")[len(str(protein_pdb).split("/")) - 2]

    if not self.ligand_only:
      protein_xyz, protein_ob = self._load_molecule(
      protein_xyz, protein_ob = load_molecule(
          protein_pdb, calc_charges=False)
    ############################################################## TIMING
    time2 = time.time()
@@ -910,7 +964,7 @@ class GridFeaturizer(ComplexFeaturizer):
    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    ligand_xyz, ligand_ob = self._load_molecule(
    ligand_xyz, ligand_ob = load_molecule(
        ligand_pdb, calc_charges=False)
    ############################################################## TIMING
    time2 = time.time()
@@ -1243,50 +1297,6 @@ class GridFeaturizer(ComplexFeaturizer):

    return feature_vector

  def _get_xyz_from_ob(self, ob_mol):
    """
    returns an m x 3 np array of 3d coords
    of given openbabel molecule
    """

    xyz = np.zeros((ob_mol.NumAtoms(), 3))
    for i, atom in enumerate(ob.OBMolAtomIter(ob_mol)):
      xyz[i, 0] = atom.x()
      xyz[i, 1] = atom.y()
      xyz[i, 2] = atom.z()
    return(xyz)

  def _load_molecule(self, molecule_file, remove_hydrogens=True,
                     calc_charges=False):
    """
    given molecule_file, returns a tuple of xyz coords of molecule
    and an openbabel object representing that molecule
    """

    if ".mol2" in molecule_file:
      obConversion = ob.OBConversion()
      obConversion.SetInAndOutFormats(str("mol2"), str("mol2"))
      ob_mol = ob.OBMol()
      obConversion.ReadFile(ob_mol, molecule_file)
    else:
      obConversion = ob.OBConversion()
      obConversion.SetInAndOutFormats(str("pdb"), str("pdb"))
      ob_mol = ob.OBMol()
      obConversion.ReadFile(ob_mol, str(molecule_file))

    if calc_charges:
      gasteiger = ob.OBChargeModel.FindType(str("gasteiger"))
      gasteiger.ComputeCharges(ob_mol)
      ob_mol.UnsetImplicitValencePerceived()
      ob_mol.CorrectForPH(7.4)
      ob_mol.AddHydrogens()
    else:
      ob_mol.AddHydrogens()

    xyz = self._get_xyz_from_ob(ob_mol)

    return xyz, ob_mol

  def _generate_box(self, mol):
    """Removes atoms outside box.

+18 −0
Original line number Diff line number Diff line
"""
Test atomic coordinates and neighbor lists.
"""
import os
import numpy as np
import unittest
from rdkit import Chem
@@ -11,6 +12,7 @@ from deepchem.featurizers.atomic_coordinates import put_atoms_in_cells
from deepchem.featurizers.atomic_coordinates import compute_neighbor_cell_map
from deepchem.featurizers.atomic_coordinates import AtomicCoordinates
from deepchem.featurizers.atomic_coordinates import NeighborListAtomicCoordinates
from deepchem.featurizers.atomic_coordinates import NeighborListComplexAtomicCoordinates

class TestAtomicCoordinates(unittest.TestCase):
  """
@@ -228,3 +230,19 @@ class TestAtomicCoordinates(unittest.TestCase):
        assert nblist[i] == [closest_nbr]
      else:
        assert nblist[i] == []

  def test_complex_featurization_simple(self):
    """Test Neighbor List computation on protein-ligand complex."""
    dir_path = os.path.dirname(os.path.realpath(__file__))
    ligand_file = os.path.join(dir_path, "data/3zso_ligand_hyd.pdb")
    protein_file = os.path.join(dir_path, "data/3zso_protein.pdb")
    max_num_neighbors = 4
    complex_featurizer = NeighborListComplexAtomicCoordinates(max_num_neighbors)

    system_coords, system_neighbor_list = complex_featurizer._featurize_complex(
        ligand_file, protein_file)
  
    N = system_coords.shape[0]
    assert len(system_neighbor_list.keys()) == N
    for atom in range(N):
      assert len(system_neighbor_list[atom]) <= max_num_neighbors
Loading