Commit aa3d7f65 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Preliminary binding pocket dataset

parent 24ac9c6d
Loading
Loading
Loading
Loading
+9 −5
Original line number Diff line number Diff line
@@ -21,8 +21,8 @@ from subprocess import call

def extract_active_site(protein_file, ligand_file, cutoff=4):
  """Extracts a box for the active site."""
  protein_coords = load_molecule(protein_file)[0]
  ligand_coords = load_molecule(ligand_file)[0]
  protein_coords = load_molecule(protein_file, add_hydrogens=False)[0]
  ligand_coords = load_molecule(ligand_file, add_hydrogens=False)[0]
  num_ligand_atoms = len(ligand_coords)
  num_protein_atoms = len(protein_coords)
  pocket_inds = []
@@ -179,13 +179,17 @@ class ConvexHullPocketFinder(BindingPocketFinder):
  def find_all_pockets(self, protein_file):
    """Find list of binding pockets on protein."""
    # protein_coords is (N, 3) tensor
    coords = load_molecule(protein_file)[0]
    coords = load_molecule(protein_file, add_hydrogens=False)[0]
    return get_all_boxes(coords, self.pad)

  def find_pockets(self, protein_file, ligand_file):
    """Find list of suitable binding pockets on protein."""
    protein_coords = load_molecule(protein_file)[0]
    ligand_coords = load_molecule(ligand_file)[0]
    protein_coords = load_molecule(protein_file, add_hydrogens=False)[0]
    ######################################################################### DEBUG
    #print("protein_coords")
    #print(protein_coords)
    ######################################################################### DEBUG
    ligand_coords = load_molecule(ligand_file, add_hydrogens=False)[0]
    boxes = get_all_boxes(protein_coords, self.pad)
    mapping = boxes_to_atoms(protein_coords, boxes)
    pockets, pocket_atoms = merge_overlapping_boxes(mapping, boxes)
+27 −6
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@ __author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "GPL"

import mdtraj as md
import unittest
import tempfile
import os
@@ -16,12 +17,12 @@ import shutil
import numpy as np
import deepchem as dc

class TestPoseGeneration(unittest.TestCase):
class TestBindingPocket(unittest.TestCase):
  """
  Does sanity checks on pose generation. 
  Does sanity checks on binding pocket generation. 
  """

  def test_convex_rf_init(self):
  def test_convex_init(self):
    """Tests that ConvexHullPocketFinder can be initialized."""
    finder = dc.dock.ConvexHullPocketFinder()

@@ -115,16 +116,36 @@ class TestPoseGeneration(unittest.TestCase):
    assert len(merged_boxes) == 1
    assert merged_boxes[0] == ((1, 3), (1, 3), (1, 3))

  def test_convex_rf_find_pockets(self):
  def test_convex_find_pockets(self):
    """Test that some pockets are filtered out."""
    current_dir = os.path.dirname(os.path.realpath(__file__))
    protein_file = os.path.join(current_dir, "1jld_protein.pdb")
    ligand_file = os.path.join(current_dir, "1jld_ligand.sdf")

    finder = dc.dock.ConvexHullPocketFinder()
    protein = md.load(protein_file)

    finder = dc.dock.ConvexHullPocketFinder()
    all_pockets = finder.find_all_pockets(protein_file)
    pockets, _, _ = finder.find_pockets(protein_file, ligand_file)
    pockets, pocket_atoms_map, pocket_coords = finder.find_pockets(
        protein_file, ligand_file)
    # Test that every atom in pocket maps exists
    n_protein_atoms = protein.xyz.shape[1]
    print("protein.xyz.shape")
    print(protein.xyz.shape)
    print("n_protein_atoms")
    print(n_protein_atoms)
    ############################################################## DEBUG
    #from deepchem.feat.grid_featurizer import load_molecule
    #protein_coords = load_molecule(protein_file, add_hydrogens=False)[0]
    #print("protein_coords.shape")
    #print(protein_coords.shape)
    ############################################################## DEBUG
    for pocket in pockets:
      pocket_atoms = pocket_atoms_map[pocket]
      for atom in pocket_atoms:
        # Check that the atoms is actually in protein
        assert atom >= 0
        assert atom < n_protein_atoms

    assert len(pockets) < len(all_pockets)

+21 −4
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@ import os
import pybel
import tempfile
import mdtraj as md
from deepchem.utils.save import log
from scipy.spatial import ConvexHull
from deepchem.feat import hydrogenate_and_compute_partial_charges
from deepchem.feat.atomic_coordinates import AtomicCoordinates
@@ -29,7 +30,10 @@ class BindingPocketFeaturizer(Featurizer):
              "ILE", "LEU", "LYS", "MET", "PHE", "PRO", "PYL", "SER", "SEC",
              "THR", "TRP", "TYR", "VAL", "ASX", "GLX"]

  def featurize(self, protein_file, pockets, pocket_atoms, pocket_coords):
  n_features = len(residues)

  def featurize(self, protein_file, pockets, pocket_atoms_map, pocket_coords,
                verbose=False):
    """
    Calculate atomic coodinates.
    """
@@ -39,12 +43,25 @@ class BindingPocketFeaturizer(Featurizer):
    res_map = dict(zip(BindingPocketFeaturizer.residues, range(n_residues)))
    all_features = np.zeros((n_pockets, n_residues)) 
    for pocket_num, (pocket, coords) in enumerate(zip(pockets, pocket_coords)):
      atoms = pocket_atoms[pocket]
      for atom in atoms:
      pocket_atoms = pocket_atoms_map[pocket]
      ################################################ DEBUG
      #print("len(pocket_atoms)")
      #print(len(pocket_atoms))
      #print("protein.top")
      #print(protein.top)
      ################################################ DEBUG
      for ind, atom in enumerate(pocket_atoms):
        atom_name = str(protein.top.atom(atom))
        ################################################ DEBUG
        #print("ind, atom, atom_name")
        #print(ind, atom, atom_name)
        ################################################ DEBUG
        # atom_name is of format RESX-ATOMTYPE
        # where X is a 1 to 4 digit number
        residue = atom_name[:3]
        if residue not in res_map:
          log("Warning: Non-standard residue in PDB file", verbose)
          continue
        atomtype = atom_name.split("-")[1]
        all_features[pocket_nu, res_map[residue]] += 1
        all_features[pocket_num, res_map[residue]] += 1
    return all_features 
+2 −2
Original line number Diff line number Diff line
@@ -48,7 +48,7 @@ def get_ligand_filetype(ligand_filename):
  else:
    raise ValueError("Unrecognized_filename")

def load_molecule(molecule_file, remove_hydrogens=True,
def load_molecule(molecule_file, add_hydrogens=True,
                  calc_charges=False):
  """Converts molecule file to (xyz-coords, obmol object)

@@ -85,7 +85,7 @@ def load_molecule(molecule_file, remove_hydrogens=True,
    ob_mol.UnsetImplicitValencePerceived()
    ob_mol.CorrectForPH(7.4)
    ob_mol.AddHydrogens()
  else:
  elif add_hydrogens:
    ob_mol.AddHydrogens()

  xyz = get_xyz_from_ob(ob_mol)
+186 −0
Original line number Diff line number Diff line
"""
PDBBind binding pocket dataset loader.
"""

from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import pandas as pd
import shutil
import time
import re
from rdkit import Chem
import deepchem as dc

def compute_binding_pocket_features(pocket_featurizer, ligand_featurizer,
                                    pdb_subdir, pdb_code, threshold=.3):
  """Compute features for a given complex"""
  protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
  ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)
  ligand_mol2 = os.path.join(pdb_subdir, "%s_ligand.mol2" % pdb_code)

  # Extract active site
  active_site_box, active_site_atoms, active_site_coords = (
      dc.dock.binding_pocket.extract_active_site(
          protein_file, ligand_file))

  # Featurize ligand
  mol = Chem.MolFromMol2File(str(ligand_mol2), removeHs=False)
  if mol is None:
    return None, None
  # Default for CircularFingerprint
  n_ligand_features = 1024
  ligand_features = ligand_featurizer.featurize([mol])

  # Featurize pocket
  finder = dc.dock.ConvexHullPocketFinder()
  pockets, pocket_atoms, pocket_coords = finder.find_pockets(protein_file, ligand_file)
  n_pockets = len(pockets)
  n_pocket_features = dc.feat.BindingPocketFeaturizer.n_features

  features = np.zeros((n_pockets, n_pocket_features+n_ligand_features))
  pocket_features = pocket_featurizer.featurize(
      protein_file, pockets, pocket_atoms, pocket_coords)
  # Note broadcast operation
  features[:, n_pocket_features:] = ligand_features

  # Compute labels for pockets
  labels = np.zeros(n_pockets)
  pocket_atoms[active_site_box] = active_site_atoms
  for ind, pocket in enumerate(pockets):
    overlap = dc.dock.binding_pocket.compute_overlap(
        pocket_atoms, active_site_box, pocket)
    if overlap > threshold:
      labels[ind] = 1
    else:
      labels[ind] = 0 
  return features, labels

def load_pdbbind_labels(labels_file):
  """Loads pdbbind labels as dataframe"""
  # Some complexes have labels but no PDB files. Filter these manually
  missing_pdbs = ["1d2v", "1jou", "1s8j", "1cam", "4mlt", "4o7d"]
  contents = []
  with open(labels_file) as f:
    for line in f:
      if line.startswith("#"):
        continue
      else:
        # Some of the ligand-names are of form (FMN ox). Use regex
        # to merge into form (FMN-ox)
        p = re.compile('\(([^\)\s]*) ([^\)\s]*)\)')
        line = p.sub('(\\1-\\2)', line)
        elts = line.split()
        # Filter if missing PDB files
        if elts[0] in missing_pdbs:
          continue
        contents.append(elts)
  contents_df = pd.DataFrame(
      contents,
      columns=("PDB code", "resolution", "release year", "-logKd/Ki", "Kd/Ki",
               "ignore-this-field", "reference", "ligand name"))
  return contents_df

def featurize_pdbbind_pockets(data_dir=None, subset="core"):
  """Featurizes pdbbind according to provided featurization"""
  tasks = ["active-site"]
  current_dir = os.path.dirname(os.path.realpath(__file__))
  data_dir = os.path.join(current_dir, "%s_pockets" % (subset))
  if os.path.exists(data_dir):
    return dc.data.DiskDataset(data_dir), tasks
  pdbbind_dir = os.path.join(current_dir, "../pdbbind/v2015")

  # Load PDBBind dataset
  if subset == "core":
    labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  elif subset == "refined":
    labels_file = os.path.join(pdbbind_dir, "INDEX_refined_data.2015")
  elif subset == "full":
    labels_file = os.path.join(pdbbind_dir, "INDEX_general_PL_data.2015")
  else:
    raise ValueError("Only core, refined, and full subsets supported.")
  print("About to load contents.")
  if not os.path.exists(labels_file):
    raise ValueError("Run ../pdbbind/get_pdbbind.sh to download dataset.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  pocket_featurizer = dc.feat.BindingPocketFeaturizer()
  ligand_featurizer = dc.feat.CircularFingerprint(size=1024)

  # Featurize Dataset
  all_features = []
  all_labels = []
  missing_pdbs = []
  all_ids = []
  time1 = time.time()
  for ind, pdb_code in enumerate(ids):
    print("Processing complex %d, %s" % (ind, str(pdb_code)))
    pdb_subdir = os.path.join(pdbbind_dir, pdb_code)
    if not os.path.exists(pdb_subdir):
      print("%s is missing!" % pdb_subdir)
      missing_pdbs.append(pdb_subdir)
      continue
    features, labels= compute_binding_pocket_features(
        pocket_featurizer, ligand_featurizer, pdb_subdir, pdb_code)
    if features is None:
      print("FEATURIZATION FAILED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
      continue
    all_features.append(features)
    all_labels.append(labels)
    ids = np.array(["%s%d" % (pdb_code, i) for i in range(len(labels))])
    all_ids.append(ids)
  time2 = time.time()
  print("TIMING: PDBBind Pocket Featurization took %0.3f s" % (time2-time1))
  ################################################################## DEBUG
  print("[features.shape for features in all_features]")
  print([features.shape for features in all_features])
  print("[labels.shape for labels in all_labels]")
  print([labels.shape for labels in all_labels])
  ################################################################## DEBUG
  X = np.vstack(all_features)
  y = np.concatenate(all_labels)
  w = np.ones_like(y)
  ids = np.concatenate(all_ids)
  ################################################################## DEBUG
  print("ids.shape")
  print(ids.shape)
  ################################################################## DEBUG

   
  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids, data_dir=data_dir)
  return dataset, tasks

def load_pdbbind_pockets(split="index", subset="core"):
  """Load PDBBind datasets. Does not do train/test split"""
  dataset, tasks = featurize_pdbbind_pockets(subset=subset)

  splitters = {'index': dc.splits.IndexSplitter(),
               'random': dc.splits.RandomSplitter()}
  splitter = splitters[split]
  ########################################################### DEBUG
  print("dataset.X.shape")
  print(dataset.X.shape)
  print("dataset.y.shape")
  print(dataset.y.shape)
  print("dataset.w.shape")
  print(dataset.w.shape)
  print("dataset.ids.shape")
  print(dataset.ids.shape)
  ########################################################### DEBUG
  train, valid, test = splitter.train_valid_test_split(dataset)

  transformers = []
  for transformer in transformers:
    train = transformer.transform(train)
  for transformer in transformers:
    valid = transformer.transform(valid)
  for transformer in transformers:
    test = transformer.transform(test)
  
  return tasks, (train, valid, test), transformers
Loading