Commit 52268a72 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

First attempt at Complex Featurizer

parent aac23085
Loading
Loading
Loading
Loading
+103 −43
Original line number Diff line number Diff line
@@ -11,6 +11,9 @@ __license__ = "LGPL v2.1+"

import numpy as np
from deepchem.featurizers import Featurizer
from deepchem.featurizers import ComplexFeaturizer
from deepchem.featurizers.grid_featurizer import load_molecule 
from deepchem.featurizers.grid_featurizer import merge_molecules

class AtomicCoordinates(Featurizer):
  """
@@ -45,6 +48,51 @@ class AtomicCoordinates(Featurizer):
    coords = [coords]
    return coords

def compute_neighbor_list(coords, neighbor_cutoff, max_num_neighbors):
  """Computes a neighbor list from atom coordinates."""
  x_bins, y_bins, z_bins = get_cells(coords, neighbor_cutoff)

  # Associate each atom with cell it belongs to. O(N)
  cell_to_atoms, atom_to_cell = put_atoms_in_cells(
      coords, x_bins, y_bins, z_bins)

  # Associate each cell with its neighbor cells. Assumes periodic boundary
  # conditions, so does wrapround. O(constant)
  N_x, N_y, N_z = len(x_bins), len(y_bins), len(z_bins)
  neighbor_cell_map = compute_neighbor_cell_map(N_x, N_y, N_z)

  # For each atom, loop through all atoms in its cell and neighboring cells.
  # Accept as neighbors only those within threshold. This computation should be
  # O(Nm), where m is the number of atoms within a set of neighboring-cells.
  neighbor_list = {}
  for atom in range(N):
    cell = atom_to_cell[atom]
    neighbor_cells = neighbor_cell_map[cell]
    # For smaller systems especially, the periodic boundary conditions can
    # result in neighboring cells being seen multiple times. Use a set() to
    # make sure duplicate neighbors are ignored. Convert back to list before
    # returning. 
    neighbor_list[atom] = set()
    for neighbor_cell in neighbor_cells:
      atoms_in_cell = cell_to_atoms[neighbor_cell]
      for neighbor_atom in atoms_in_cell:
        if neighbor_atom == atom:
          continue
        # TODO(rbharath): How does distance need to be modified here to
        # account for periodic boundary conditions?
        dist = np.linalg.norm(coords[atom] - coords[neighbor_atom])
        if dist < neighbor_cutoff:
          neighbor_list[atom].add((neighbor_atom, dist))
        
    # Sort neighbors by distance
    closest_neighbors = sorted(
        list(neighbor_list[atom]), key=lambda elt: elt[1])
    closest_neighbors = [nbr for (nbr, dist) in closest_neighbors]
    # Pick up to max_num_neighbors
    closest_neighbors = closest_neighbors[:max_num_neighbors]
    neighbor_list[atom] = closest_neighbors
    return neighbor_list

def get_cells(coords, neighbor_cutoff):
  """Computes cells given molecular coordinates."""
  x_max, x_min = np.amax(coords[:, 0]), np.amin(coords[:, 0])
@@ -187,53 +235,65 @@ class NeighborListAtomicCoordinates(Featurizer):

    Parameters
    ----------
      mol: rdkit Mol
        To be featurized.
    """
    N = mol.GetNumAtoms()
    # TODO(rbharath): Should this return a list?
    bohr_coords = self.coordinates_featurizer._featurize(mol)[0]
    coords = get_coords(mol)
    neighbor_list = compute_neighbor_list(
        coords, self.neighbor_cutoff, self.max_num_neighbors)
        
    x_bins, y_bins, z_bins = get_cells(coords, self.neighbor_cutoff)

    # Associate each atom with cell it belongs to. O(N)
    cell_to_atoms, atom_to_cell = put_atoms_in_cells(
        coords, x_bins, y_bins, z_bins)

    # Associate each cell with its neighbor cells. Assumes periodic boundary
    # conditions, so does wrapround. O(constant)
    N_x, N_y, N_z = len(x_bins), len(y_bins), len(z_bins)
    neighbor_cell_map = compute_neighbor_cell_map(N_x, N_y, N_z)
    return (bohr_coords, neighbor_list)

    # For each atom, loop through all atoms in its cell and neighboring cells.
    # Accept as neighbors only those within threshold. This computation should be
    # O(Nm), where m is the number of atoms within a set of neighboring-cells.
    neighbor_list = {}
    for atom in range(N):
      cell = atom_to_cell[atom]
      neighbor_cells = neighbor_cell_map[cell]
      # For smaller systems especially, the periodic boundary conditions can
      # result in neighboring cells being seen multiple times. Use a set() to
      # make sure duplicate neighbors are ignored. Convert back to list before
      # returning. 
      neighbor_list[atom] = set()
      for neighbor_cell in neighbor_cells:
        atoms_in_cell = cell_to_atoms[neighbor_cell]
        for neighbor_atom in atoms_in_cell:
          if neighbor_atom == atom:
            continue
          # TODO(rbharath): How does distance need to be modified here to
          # account for periodic boundary conditions?
          dist = np.linalg.norm(coords[atom] - coords[neighbor_atom])
          if dist < self.neighbor_cutoff:
            neighbor_list[atom].add((neighbor_atom, dist))
# TODO(rbharath): This shares a lot of code with NeighborListAtomicCoordinates.
# Is there some elegant way to refactor to avoid code duplication?
class NeighborListComplexAtomicCoordinates(ComplexFeaturizer):
  """
  Adjacency list of neighbors for protein-ligand complexes in 3-space.

      # Sort neighbors by distance
      closest_neighbors = sorted(
          list(neighbor_list[atom]), key=lambda elt: elt[1])
      closest_neighbors = [nbr for (nbr, dist) in closest_neighbors]
      # Pick up to max_num_neighbors
      closest_neighbors = closest_neighbors[:self.max_num_neighbors]
      neighbor_list[atom] = closest_neighbors
  Neighbors dtermined by user-dfined distance cutoff.
  """
  def __init__(self, max_num_neighbors=None, neighbor_cutoff=4):
    if neighbor_cutoff <= 0:
      raise ValueError("neighbor_cutoff must be positive value.")
    if max_num_neighbors is not None:
      if not isinstance(max_num_neighbors, int) or max_num_neighbors <= 0:
        raise ValueError("max_num_neighbors must be positive integer.")
    self.max_num_neighbors = max_num_neighbors
    self.neighbor_cutoff = neighbor_cutoff
    # Type of data created by this featurizer
    self.dtype = object
    self.coordinates_featurizer = AtomicCoordinates()

  def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
    """
    Compute neighbor list for complex.

    return (bohr_coords, neighbor_list)
    Parameters
    ----------
    mol_pdb: list
      Should be a list of lines of the PDB file.
    complex_pdb: list
      Should be a list of lines of the PDB file.
    """
    mol_coords, ob_mol = load_molecule(mol_pdb_file)
    protein_coords, protein_mol = load_molecule(protein_pdb_file)
    ########################################################## DEBUG
    print("mol_pdb_file, protein_pdb_file")
    print(mol_pdb_file, protein_pdb_file)
    print("type(mol_coords), type(ob_mol)")
    print(type(mol_coords), type(ob_mol))
    print("type(protein_coords), type(protein_mol)")
    print(type(protein_coords), type(protein_mol))
    print("mol_coords.shape, protein_coords.shape")
    print(mol_coords.shape, protein_coords.shape)
    ########################################################## DEBUG
    system_coords, system_mol = merge_molecules(
        mol_coords, ob_mol, protein_coords, protein_mol)
    
    system_neighbor_list = compute_neighbor_list(
        system_coords, self.neighbor_cutoff, self.max_num_neighbors)

    return (system_coords, system_neighbor_list)
+73 −60
Original line number Diff line number Diff line
@@ -28,6 +28,77 @@ import time
"""
http://stackoverflow.com/questions/38987/how-can-i-merge-two-python-dictionaries-in-a-single-expression
"""
def get_xyz_from_ob(ob_mol):
  """
  returns an m x 3 np array of 3d coords
  of given openbabel molecule
  """

  xyz = np.zeros((ob_mol.NumAtoms(), 3))
  for i, atom in enumerate(ob.OBMolAtomIter(ob_mol)):
    xyz[i, 0] = atom.x()
    xyz[i, 1] = atom.y()
    xyz[i, 2] = atom.z()
  return(xyz)


def load_molecule(molecule_file, remove_hydrogens=True,
                  calc_charges=False):
  """Converts molecule file to (xyz-coords, obmol object)

  Given molecule_file, returns a tuple of xyz coords of molecule
  and an openbabel object representing that molecule
  """

  if ".mol2" in molecule_file:
    ###################################################### DEBUG
    print("mol2 file")
    ###################################################### DEBUG
    obConversion = ob.OBConversion()
    obConversion.SetInAndOutFormats(str("mol2"), str("mol2"))
    ob_mol = ob.OBMol()
    obConversion.ReadFile(ob_mol, molecule_file)
  else:
    ###################################################### DEBUG
    print("non mol2 file")
    ###################################################### DEBUG
    obConversion = ob.OBConversion()
    obConversion.SetInAndOutFormats(str("pdb"), str("pdb"))
    ob_mol = ob.OBMol()
    obConversion.ReadFile(ob_mol, str(molecule_file))
    ###################################################### DEBUG
    print("ob_mol.NumAtoms()")
    print(ob_mol.NumAtoms())
    ###################################################### DEBUG

  if calc_charges:
    gasteiger = ob.OBChargeModel.FindType(str("gasteiger"))
    gasteiger.ComputeCharges(ob_mol)
    ob_mol.UnsetImplicitValencePerceived()
    ob_mol.CorrectForPH(7.4)
    ob_mol.AddHydrogens()
  else:
    ob_mol.AddHydrogens()

  xyz = get_xyz_from_ob(ob_mol)

  return xyz, ob_mol

def merge_molecules(protein_xyz, protein, ligand_xyz, ligand):
  """Merges coordinates of protein and ligand.

  Takes as input protein and ligand objects of class PDB and adds ligand atoms
  to the protein, and returns the new instance of class PDB called system that
  contains both sets of atoms.
  """

  system_xyz = np.array(np.vstack(np.vstack((protein_xyz, ligand_xyz))))
  system_ob = ob.OBMol(protein_ob)
  system_ob += ligand_ob

  return system_xyz, system_ob


def _merge_two_dicts(x, y):
  """Given two dicts, merge them into a new dict as a shallow copy."""
  z = x.copy()
@@ -749,20 +820,6 @@ def _convert_atom_pair_to_voxel(
                        atom_index_pair[1], box_width, voxel_width)[0])
  return(indices_list)

def _merge_molecules(protein_xyz, protein, ligand_xyz, ligand):
  """Merges coordinates of protein and ligand.

  Takes as input protein and ligand objects of class PDB and adds ligand atoms
  to the protein, and returns the new instance of class PDB called system that
  contains both sets of atoms.
  """

  system_xyz = np.array(np.vstack(np.vstack((protein_xyz, ligand_xyz))))
  system_ob = ob.OBMol(protein_ob)
  system_ob += ligand_ob

  return system_xyz, system_ob

def _compute_charge_dictionary(molecule):
  """Computes partial charges for each atom."""
  charge_dictionary = {}
@@ -900,7 +957,7 @@ class GridFeaturizer(ComplexFeaturizer):
      "/")[len(str(protein_pdb).split("/")) - 2]

    if not self.ligand_only:
      protein_xyz, protein_ob = self._load_molecule(
      protein_xyz, protein_ob = load_molecule(
          protein_pdb, calc_charges=False)
    ############################################################## TIMING
    time2 = time.time()
@@ -910,7 +967,7 @@ class GridFeaturizer(ComplexFeaturizer):
    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    ligand_xyz, ligand_ob = self._load_molecule(
    ligand_xyz, ligand_ob = load_molecule(
        ligand_pdb, calc_charges=False)
    ############################################################## TIMING
    time2 = time.time()
@@ -1243,50 +1300,6 @@ class GridFeaturizer(ComplexFeaturizer):

    return feature_vector

  def _get_xyz_from_ob(self, ob_mol):
    """
    returns an m x 3 np array of 3d coords
    of given openbabel molecule
    """

    xyz = np.zeros((ob_mol.NumAtoms(), 3))
    for i, atom in enumerate(ob.OBMolAtomIter(ob_mol)):
      xyz[i, 0] = atom.x()
      xyz[i, 1] = atom.y()
      xyz[i, 2] = atom.z()
    return(xyz)

  def _load_molecule(self, molecule_file, remove_hydrogens=True,
                     calc_charges=False):
    """
    given molecule_file, returns a tuple of xyz coords of molecule
    and an openbabel object representing that molecule
    """

    if ".mol2" in molecule_file:
      obConversion = ob.OBConversion()
      obConversion.SetInAndOutFormats(str("mol2"), str("mol2"))
      ob_mol = ob.OBMol()
      obConversion.ReadFile(ob_mol, molecule_file)
    else:
      obConversion = ob.OBConversion()
      obConversion.SetInAndOutFormats(str("pdb"), str("pdb"))
      ob_mol = ob.OBMol()
      obConversion.ReadFile(ob_mol, str(molecule_file))

    if calc_charges:
      gasteiger = ob.OBChargeModel.FindType(str("gasteiger"))
      gasteiger.ComputeCharges(ob_mol)
      ob_mol.UnsetImplicitValencePerceived()
      ob_mol.CorrectForPH(7.4)
      ob_mol.AddHydrogens()
    else:
      ob_mol.AddHydrogens()

    xyz = self._get_xyz_from_ob(ob_mol)

    return xyz, ob_mol

  def _generate_box(self, mol):
    """Removes atoms outside box.

+14 −0
Original line number Diff line number Diff line
"""
Test atomic coordinates and neighbor lists.
"""
import os
import numpy as np
import unittest
from rdkit import Chem
@@ -11,6 +12,7 @@ from deepchem.featurizers.atomic_coordinates import put_atoms_in_cells
from deepchem.featurizers.atomic_coordinates import compute_neighbor_cell_map
from deepchem.featurizers.atomic_coordinates import AtomicCoordinates
from deepchem.featurizers.atomic_coordinates import NeighborListAtomicCoordinates
from deepchem.featurizers.atomic_coordinates import NeighborListComplexAtomicCoordinates

class TestAtomicCoordinates(unittest.TestCase):
  """
@@ -228,3 +230,15 @@ class TestAtomicCoordinates(unittest.TestCase):
        assert nblist[i] == [closest_nbr]
      else:
        assert nblist[i] == []

  def test_complex_featurization_simple(self):
    """Test Neighbor List computation on protein-ligand complex."""
    dir_path = os.path.dirname(os.path.realpath(__file__))
    ligand_file = os.path.join(dir_path, "data/3zso_ligand_hyd.pdb")
    protein_file = os.path.join(dir_path, "data/3zso_protein.pdb")
    max_num_neighbors = 4
    complex_featurizer = NeighborListComplexAtomicCoordinates(max_num_neighbors)

    system_coords, system_neighbor_list = complex_featurizer._featurize_complex(
        ligand_file, protein_file)