Preliminary binding pocket dataset (aa3d7f65) · Commits · 钟慕尧 / deepchem

deepchem/dock/binding_pocket.py

+9 −5

Original line number	Diff line number	Diff line
		@@ -21,8 +21,8 @@ from subprocess import call

		def extract_active_site(protein_file, ligand_file, cutoff=4):
		"""Extracts a box for the active site."""
		protein_coords = load_molecule(protein_file)[0]
		ligand_coords = load_molecule(ligand_file)[0]
		protein_coords = load_molecule(protein_file, add_hydrogens=False)[0]
		ligand_coords = load_molecule(ligand_file, add_hydrogens=False)[0]
		num_ligand_atoms = len(ligand_coords)
		num_protein_atoms = len(protein_coords)
		pocket_inds = []
		@@ -179,13 +179,17 @@ class ConvexHullPocketFinder(BindingPocketFinder):
		def find_all_pockets(self, protein_file):
		"""Find list of binding pockets on protein."""
		# protein_coords is (N, 3) tensor
		coords = load_molecule(protein_file)[0]
		coords = load_molecule(protein_file, add_hydrogens=False)[0]
		return get_all_boxes(coords, self.pad)

		def find_pockets(self, protein_file, ligand_file):
		"""Find list of suitable binding pockets on protein."""
		protein_coords = load_molecule(protein_file)[0]
		ligand_coords = load_molecule(ligand_file)[0]
		protein_coords = load_molecule(protein_file, add_hydrogens=False)[0]
		######################################################################### DEBUG
		#print("protein_coords")
		#print(protein_coords)
		######################################################################### DEBUG
		ligand_coords = load_molecule(ligand_file, add_hydrogens=False)[0]
		boxes = get_all_boxes(protein_coords, self.pad)
		mapping = boxes_to_atoms(protein_coords, boxes)
		pockets, pocket_atoms = merge_overlapping_boxes(mapping, boxes)

deepchem/dock/tests/test_binding_pocket.py

+27 −6

Original line number	Diff line number	Diff line
		@@ -9,6 +9,7 @@ __author__ = "Bharath Ramsundar"
		__copyright__ = "Copyright 2016, Stanford University"
		__license__ = "GPL"

		import mdtraj as md
		import unittest
		import tempfile
		import os
		@@ -16,12 +17,12 @@ import shutil
		import numpy as np
		import deepchem as dc

		class TestPoseGeneration(unittest.TestCase):
		class TestBindingPocket(unittest.TestCase):
		"""
		Does sanity checks on pose generation.
		Does sanity checks on binding pocket generation.
		"""

		def test_convex_rf_init(self):
		def test_convex_init(self):
		"""Tests that ConvexHullPocketFinder can be initialized."""
		finder = dc.dock.ConvexHullPocketFinder()

		@@ -115,16 +116,36 @@ class TestPoseGeneration(unittest.TestCase):
		assert len(merged_boxes) == 1
		assert merged_boxes[0] == ((1, 3), (1, 3), (1, 3))

		def test_convex_rf_find_pockets(self):
		def test_convex_find_pockets(self):
		"""Test that some pockets are filtered out."""
		current_dir = os.path.dirname(os.path.realpath(__file__))
		protein_file = os.path.join(current_dir, "1jld_protein.pdb")
		ligand_file = os.path.join(current_dir, "1jld_ligand.sdf")

		finder = dc.dock.ConvexHullPocketFinder()
		protein = md.load(protein_file)

		finder = dc.dock.ConvexHullPocketFinder()
		all_pockets = finder.find_all_pockets(protein_file)
		pockets, _, _ = finder.find_pockets(protein_file, ligand_file)
		pockets, pocket_atoms_map, pocket_coords = finder.find_pockets(
		protein_file, ligand_file)
		# Test that every atom in pocket maps exists
		n_protein_atoms = protein.xyz.shape[1]
		print("protein.xyz.shape")
		print(protein.xyz.shape)
		print("n_protein_atoms")
		print(n_protein_atoms)
		############################################################## DEBUG
		#from deepchem.feat.grid_featurizer import load_molecule
		#protein_coords = load_molecule(protein_file, add_hydrogens=False)[0]
		#print("protein_coords.shape")
		#print(protein_coords.shape)
		############################################################## DEBUG
		for pocket in pockets:
		pocket_atoms = pocket_atoms_map[pocket]
		for atom in pocket_atoms:
		# Check that the atoms is actually in protein
		assert atom >= 0
		assert atom < n_protein_atoms

		assert len(pockets) < len(all_pockets)

deepchem/feat/binding_pocket_features.py

+21 −4

Original line number	Diff line number	Diff line
		@@ -14,6 +14,7 @@ import os
		import pybel
		import tempfile
		import mdtraj as md
		from deepchem.utils.save import log
		from scipy.spatial import ConvexHull
		from deepchem.feat import hydrogenate_and_compute_partial_charges
		from deepchem.feat.atomic_coordinates import AtomicCoordinates
		@@ -29,7 +30,10 @@ class BindingPocketFeaturizer(Featurizer):
		"ILE", "LEU", "LYS", "MET", "PHE", "PRO", "PYL", "SER", "SEC",
		"THR", "TRP", "TYR", "VAL", "ASX", "GLX"]

		def featurize(self, protein_file, pockets, pocket_atoms, pocket_coords):
		n_features = len(residues)

		def featurize(self, protein_file, pockets, pocket_atoms_map, pocket_coords,
		verbose=False):
		"""
		Calculate atomic coodinates.
		"""
		@@ -39,12 +43,25 @@ class BindingPocketFeaturizer(Featurizer):
		res_map = dict(zip(BindingPocketFeaturizer.residues, range(n_residues)))
		all_features = np.zeros((n_pockets, n_residues))
		for pocket_num, (pocket, coords) in enumerate(zip(pockets, pocket_coords)):
		atoms = pocket_atoms[pocket]
		for atom in atoms:
		pocket_atoms = pocket_atoms_map[pocket]
		################################################ DEBUG
		#print("len(pocket_atoms)")
		#print(len(pocket_atoms))
		#print("protein.top")
		#print(protein.top)
		################################################ DEBUG
		for ind, atom in enumerate(pocket_atoms):
		atom_name = str(protein.top.atom(atom))
		################################################ DEBUG
		#print("ind, atom, atom_name")
		#print(ind, atom, atom_name)
		################################################ DEBUG
		# atom_name is of format RESX-ATOMTYPE
		# where X is a 1 to 4 digit number
		residue = atom_name[:3]
		if residue not in res_map:
		log("Warning: Non-standard residue in PDB file", verbose)
		continue
		atomtype = atom_name.split("-")[1]
		all_features[pocket_nu, res_map[residue]] += 1
		all_features[pocket_num, res_map[residue]] += 1
		return all_features

deepchem/feat/grid_featurizer.py

+2 −2

Original line number	Diff line number	Diff line
		@@ -48,7 +48,7 @@ def get_ligand_filetype(ligand_filename):
		else:
		raise ValueError("Unrecognized_filename")

		def load_molecule(molecule_file, remove_hydrogens=True,
		def load_molecule(molecule_file, add_hydrogens=True,
		calc_charges=False):
		"""Converts molecule file to (xyz-coords, obmol object)

		@@ -85,7 +85,7 @@ def load_molecule(molecule_file, remove_hydrogens=True,
		ob_mol.UnsetImplicitValencePerceived()
		ob_mol.CorrectForPH(7.4)
		ob_mol.AddHydrogens()
		else:
		elif add_hydrogens:
		ob_mol.AddHydrogens()

		xyz = get_xyz_from_ob(ob_mol)

examples/binding_pockets/binding_pocket_datasets.py

0 → 100644

+186 −0

Original line number	Diff line number	Diff line
		"""
		PDBBind binding pocket dataset loader.
		"""

		from __future__ import print_function
		from __future__ import division
		from __future__ import unicode_literals

		import os
		import numpy as np
		import pandas as pd
		import shutil
		import time
		import re
		from rdkit import Chem
		import deepchem as dc

		def compute_binding_pocket_features(pocket_featurizer, ligand_featurizer,
		pdb_subdir, pdb_code, threshold=.3):
		"""Compute features for a given complex"""
		protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
		ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)
		ligand_mol2 = os.path.join(pdb_subdir, "%s_ligand.mol2" % pdb_code)

		# Extract active site
		active_site_box, active_site_atoms, active_site_coords = (
		dc.dock.binding_pocket.extract_active_site(
		protein_file, ligand_file))

		# Featurize ligand
		mol = Chem.MolFromMol2File(str(ligand_mol2), removeHs=False)
		if mol is None:
		return None, None
		# Default for CircularFingerprint
		n_ligand_features = 1024
		ligand_features = ligand_featurizer.featurize([mol])

		# Featurize pocket
		finder = dc.dock.ConvexHullPocketFinder()
		pockets, pocket_atoms, pocket_coords = finder.find_pockets(protein_file, ligand_file)
		n_pockets = len(pockets)
		n_pocket_features = dc.feat.BindingPocketFeaturizer.n_features

		features = np.zeros((n_pockets, n_pocket_features+n_ligand_features))
		pocket_features = pocket_featurizer.featurize(
		protein_file, pockets, pocket_atoms, pocket_coords)
		# Note broadcast operation
		features[:, n_pocket_features:] = ligand_features

		# Compute labels for pockets
		labels = np.zeros(n_pockets)
		pocket_atoms[active_site_box] = active_site_atoms
		for ind, pocket in enumerate(pockets):
		overlap = dc.dock.binding_pocket.compute_overlap(
		pocket_atoms, active_site_box, pocket)
		if overlap > threshold:
		labels[ind] = 1
		else:
		labels[ind] = 0
		return features, labels

		def load_pdbbind_labels(labels_file):
		"""Loads pdbbind labels as dataframe"""
		# Some complexes have labels but no PDB files. Filter these manually
		missing_pdbs = ["1d2v", "1jou", "1s8j", "1cam", "4mlt", "4o7d"]
		contents = []
		with open(labels_file) as f:
		for line in f:
		if line.startswith("#"):
		continue
		else:
		# Some of the ligand-names are of form (FMN ox). Use regex
		# to merge into form (FMN-ox)
		p = re.compile('\(([^\)\s]) ([^\)\s])\)')
		line = p.sub('(\\1-\\2)', line)
		elts = line.split()
		# Filter if missing PDB files
		if elts[0] in missing_pdbs:
		continue
		contents.append(elts)
		contents_df = pd.DataFrame(
		contents,
		columns=("PDB code", "resolution", "release year", "-logKd/Ki", "Kd/Ki",
		"ignore-this-field", "reference", "ligand name"))
		return contents_df

		def featurize_pdbbind_pockets(data_dir=None, subset="core"):
		"""Featurizes pdbbind according to provided featurization"""
		tasks = ["active-site"]
		current_dir = os.path.dirname(os.path.realpath(__file__))
		data_dir = os.path.join(current_dir, "%s_pockets" % (subset))
		if os.path.exists(data_dir):
		return dc.data.DiskDataset(data_dir), tasks
		pdbbind_dir = os.path.join(current_dir, "../pdbbind/v2015")

		# Load PDBBind dataset
		if subset == "core":
		labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
		elif subset == "refined":
		labels_file = os.path.join(pdbbind_dir, "INDEX_refined_data.2015")
		elif subset == "full":
		labels_file = os.path.join(pdbbind_dir, "INDEX_general_PL_data.2015")
		else:
		raise ValueError("Only core, refined, and full subsets supported.")
		print("About to load contents.")
		if not os.path.exists(labels_file):
		raise ValueError("Run ../pdbbind/get_pdbbind.sh to download dataset.")
		contents_df = load_pdbbind_labels(labels_file)
		ids = contents_df["PDB code"].values
		y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

		# Define featurizers
		pocket_featurizer = dc.feat.BindingPocketFeaturizer()
		ligand_featurizer = dc.feat.CircularFingerprint(size=1024)

		# Featurize Dataset
		all_features = []
		all_labels = []
		missing_pdbs = []
		all_ids = []
		time1 = time.time()
		for ind, pdb_code in enumerate(ids):
		print("Processing complex %d, %s" % (ind, str(pdb_code)))
		pdb_subdir = os.path.join(pdbbind_dir, pdb_code)
		if not os.path.exists(pdb_subdir):
		print("%s is missing!" % pdb_subdir)
		missing_pdbs.append(pdb_subdir)
		continue
		features, labels= compute_binding_pocket_features(
		pocket_featurizer, ligand_featurizer, pdb_subdir, pdb_code)
		if features is None:
		print("FEATURIZATION FAILED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
		continue
		all_features.append(features)
		all_labels.append(labels)
		ids = np.array(["%s%d" % (pdb_code, i) for i in range(len(labels))])
		all_ids.append(ids)
		time2 = time.time()
		print("TIMING: PDBBind Pocket Featurization took %0.3f s" % (time2-time1))
		################################################################## DEBUG
		print("[features.shape for features in all_features]")
		print([features.shape for features in all_features])
		print("[labels.shape for labels in all_labels]")
		print([labels.shape for labels in all_labels])
		################################################################## DEBUG
		X = np.vstack(all_features)
		y = np.concatenate(all_labels)
		w = np.ones_like(y)
		ids = np.concatenate(all_ids)
		################################################################## DEBUG
		print("ids.shape")
		print(ids.shape)
		################################################################## DEBUG


		dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids, data_dir=data_dir)
		return dataset, tasks

		def load_pdbbind_pockets(split="index", subset="core"):
		"""Load PDBBind datasets. Does not do train/test split"""
		dataset, tasks = featurize_pdbbind_pockets(subset=subset)

		splitters = {'index': dc.splits.IndexSplitter(),
		'random': dc.splits.RandomSplitter()}
		splitter = splitters[split]
		########################################################### DEBUG
		print("dataset.X.shape")
		print(dataset.X.shape)
		print("dataset.y.shape")
		print(dataset.y.shape)
		print("dataset.w.shape")
		print(dataset.w.shape)
		print("dataset.ids.shape")
		print(dataset.ids.shape)
		########################################################### DEBUG
		train, valid, test = splitter.train_valid_test_split(dataset)

		transformers = []
		for transformer in transformers:
		train = transformer.transform(train)
		for transformer in transformers:
		valid = transformer.transform(valid)
		for transformer in transformers:
		test = transformer.transform(test)

		return tasks, (train, valid, test), transformers

Admin message