No Openbabel in pdbbind benchmark (03ef391a) · Commits · 钟慕尧 / deepchem

deepchem/feat/init.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -18,6 +18,7 @@ from deepchem.feat.basic import RDKitDescriptors
		from deepchem.feat.coulomb_matrices import CoulombMatrix
		from deepchem.feat.coulomb_matrices import CoulombMatrixEig
		from deepchem.feat.grid_featurizer import GridFeaturizer
		from deepchem.feat.rdkit_grid_featurizer import RdkitGridFeaturizer
		from deepchem.feat.nnscore_utils import hydrogenate_and_compute_partial_charges
		from deepchem.feat.binding_pocket_features import BindingPocketFeaturizer
		from deepchem.feat.one_hot import OneHotFeaturizer

deepchem/feat/rdkit_grid_featurizer.py

0 → 100644

+1191 −0

File added.

Preview size limit exceeded, changes collapsed.

deepchem/trans/transformers.py

+1 −0

Original line number	Diff line number	Diff line
		# coding=utf-8
		"""
		Contains an abstract base class that supports data transformations.
		"""

deepchem/utils/rdkit_util.py

0 → 100644

+108 −0

Original line number	Diff line number	Diff line
		import logging

		import os
		import numpy as np
		import tempfile
		from rdkit import Chem
		from rdkit.Chem import AllChem
		from pdbfixer import PDBFixer
		from simtk.openmm.app import PDBFile


		class MoleculeLoadException(Exception):

		def __init__(self, args, *kwargs):
		Exception.__init__(args, *kwargs)


		def get_xyz_from_mol(mol):
		"""
		returns an m x 3 np array of 3d coords
		of given rdkit molecule
		"""
		xyz = np.zeros((mol.GetNumAtoms(), 3))
		conf = mol.GetConformer()
		for i in range(conf.GetNumAtoms()):
		position = conf.GetAtomPosition(i)
		xyz[i, 0] = position.x
		xyz[i, 1] = position.y
		xyz[i, 2] = position.z
		return (xyz)


		def add_hydrogens_f(mol):
		molecule_file = str(tempfile.NamedTemporaryFile().name)
		Chem.MolToPDBFile(mol, molecule_file)
		try:
		fixer = PDBFixer(filename=molecule_file)
		fixer.addMissingHydrogens(7.4)
		PDBFile.writeFile(fixer.topology, fixer.positions, open(molecule_file, 'w'))
		except ValueError as e:
		print(e)
		raise MoleculeLoadException(e)
		return Chem.MolFromPDBFile(str(molecule_file), sanitize=False, removeHs=False)


		def compute_charges(mol):
		try:
		AllChem.ComputeGasteigerCharges(mol)
		except Exception as e:
		logging.exception("Unable to compute charges for mol")
		raise MoleculeLoadException(e)
		return mol


		def load_molecule(molecule_file, add_hydrogens=True, calc_charges=True):
		"""Converts molecule file to (xyz-coords, obmol object)

		Given molecule_file, returns a tuple of xyz coords of molecule
		and an rdkit object representing that molecule
		"""
		if ".mol2" in molecule_file or ".sdf" in molecule_file:
		suppl = Chem.SDMolSupplier(str(molecule_file), sanitize=False)
		my_mol = suppl[0]
		elif ".pdbqt" in molecule_file:
		raise MoleculeLoadException("Don't support pdbqt files yet")
		elif ".pdb" in molecule_file:
		my_mol = Chem.MolFromPDBFile(str(molecule_file), sanitize=False, removeHs=False)
		else:
		raise ValueError("Unrecognized file type")

		if my_mol is None:
		raise ValueError("Unable to read non None Molecule Object")

		if calc_charges:
		my_mol = add_hydrogens_f(my_mol)
		compute_charges(my_mol)
		elif add_hydrogens:
		my_mol = add_hydrogens_f(my_mol)
		compute_charges(my_mol)

		xyz = get_xyz_from_mol(my_mol)

		return xyz, my_mol


		def write_molecule(mol, outfile):
		if ".pdbqt" in outfile:
		# TODO (LESWING) create writer for pdbqt which includes charges
		writer = Chem.PDBWriter(outfile)
		writer.write(mol)
		writer.close()
		pass
		elif ".pdb" in outfile:
		writer = Chem.PDBWriter(outfile)
		writer.write(mol)
		writer.close()
		else:
		raise ValueError("Unsupported Format")


		def pdbqt_to_pdb(filename):
		base_filename = os.path.splitext(filename)[0]
		pdb_filename = base_filename + ".pdb"
		pdbqt_data = open(filename).readlines()
		with open(pdb_filename, 'w') as fout:
		for line in pdbqt_data:
		fout.write("%s\n" % line[:66])
		return pdb_filename

examples/pdbbind/pdbbind_datasets.py

+49 −52

Original line number	Diff line number	Diff line
		@@ -2,40 +2,26 @@
		PDBBind dataset loader.
		"""

		from __future__ import print_function
		from __future__ import division
		from __future__ import print_function
		from __future__ import unicode_literals

		import logging
		import multiprocessing
		import os
		import numpy as np
		import pandas as pd
		import shutil
		import time
		import re
		from rdkit import Chem
		import time
		from multiprocessing.pool import Pool

		import deepchem as dc
		import numpy as np
		import pandas as pd
		from deepchem.utils.rdkit_util import MoleculeLoadException


		def load_pdbbind_labels(labels_file):
		"""Loads pdbbind labels as dataframe"""
		# Some complexes have labels but no PDB files. Filter these manually
		missing_pdbs = [
		"1d2v", "1jou", "1s8j", "3f39", "3i3d", "3i3b", "3dyo", "3t0d", "1cam",
		"3vdb", "3f37", "3f38", "4mlt", "3f36", "4o7d", "3t08", "3f34", "3f35",
		"2wik", "4mlx", "2wij", "1px4", "4wkt", "3f33", "2wig", "3muz", "3t2p",
		"3t2q", "4pji", "2adj", "3t09", "3mv0", "1pts", "3vd9", "3axk", "4q1s",
		"3t0b", "4b82", "3vd7", "3hg1", "3vd4", "3vdc", "3b5y", "4oi6", "3axm",
		"4mdm", "2mlm", "3eql", "4ob0", "3wi6", "4fgt", "4pnc", "4mvn", "4lv3",
		"4lz9", "1pyg", "3h1k", "7gpb", "1e8h", "4wku", "2f2h", "1zyr", "1z9j",
		"3b5d", "3b62", "4q3q", "4mdl", "4no6", "4mdg", "3dxj", "4u0x", "4l6q",
		"4q3r", "1h9s", "4ob1", "4ob2", "4qq5", "4nk3", "3k1j", "4m8t", "4mzo",
		"4nnn", "4q3s", "4nnw", "3cf1", "4u5t", "4wkv", "4ool", "3a2c", "4wm9",
		"4pkb", "4qkx", "4no8", "1ztz", "1nu1", "4kn4", "4mao", "4qqc", "4len",
		"4lv1", "4r02", "4r6v", "4fil", "4q2k", "1hpb", "4oon", "4qbb", "4ruu",
		"4no1", "3w8o", "4kn7", "4r17", "4r18", "5hvp", "1e59", "1sqq", "3n75",
		"4kmu", "4mzs", "1sqb", "1lr8", "4lv2", "4wmc", "1sqp", "3whw", "4cpa",
		"3i8w", "4hrd", "4hrc", "1ntk", "1rbo"
		]
		contents = []
		with open(labels_file) as f:
		for line in f:
		@@ -47,9 +33,6 @@ def load_pdbbind_labels(labels_file):
		p = re.compile('\(([^\)\s]) ([^\)\s])\)')
		line = p.sub('(\\1-\\2)', line)
		elts = line.split()
		# Filter if missing PDB files
		if elts[0] in missing_pdbs:
		continue
		contents.append(elts)
		contents_df = pd.DataFrame(
		contents,
		@@ -58,15 +41,6 @@ def load_pdbbind_labels(labels_file):
		return contents_df


		def compute_pdbbind_features(grid_featurizer, pdb_subdir, pdb_code):
		"""Compute features for a given complex"""
		protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
		ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)
		features = grid_featurizer.featurize_complexes([ligand_file], [protein_file])
		features = np.squeeze(features)
		return features


		def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
		"""Featurizes pdbbind according to provided featurization"""
		tasks = ["-logKd/Ki"]
		@@ -94,7 +68,7 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):

		# Define featurizers
		if feat == "grid":
		featurizer = dc.feat.GridFeaturizer(
		featurizer = dc.feat.RdkitGridFeaturizer(
		voxel_width=16.0,
		feature_types="voxel_combined",
		# TODO(rbharath, enf, leswing): Figure out why pi_stack and cation_pi
		@@ -116,30 +90,25 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):

		# Featurize Dataset
		features = []
		feature_len = None
		y_inds = []
		missing_pdbs = []
		time1 = time.time()
		p = Pool(multiprocessing.cpu_count())
		args = []
		for ind, pdb_code in enumerate(ids):
		print("Processing complex %d, %s" % (ind, str(pdb_code)))
		pdb_subdir = os.path.join(pdbbind_dir, pdb_code)
		if not os.path.exists(pdb_subdir):
		print("%s is missing!" % pdb_subdir)
		missing_pdbs.append(pdb_subdir)
		args.append((ind, pdb_code, pdbbind_dir, featurizer))
		results = p.map(compute_single_pdbbind_feature, args)
		feature_len = None
		for result in results:
		if result is None:
		continue
		computed_feature = compute_pdbbind_features(featurizer, pdb_subdir,
		pdb_code)
		if feature_len is None:
		feature_len = len(computed_feature)
		if len(computed_feature) != feature_len:
		print("Featurization failed for %s!" % pdb_code)
		feature_len = len(result[1])
		if len(result[1]) != feature_len:
		continue
		y_inds.append(ind)
		features.append(computed_feature)
		y_inds.append(result[0])
		features.append(result[1])
		time2 = time.time()
		print("TIMING: PDBBind Featurization took %0.3f s" % (time2 - time1))
		print("missing_pdbs")
		print(missing_pdbs)
		y = y[y_inds]
		X = np.vstack(features)
		w = np.ones_like(y)
		@@ -148,6 +117,31 @@ def featurize_pdbbind(data_dir=None, feat="grid", subset="core"):
		return dataset, tasks


		def compute_single_pdbbind_feature(x):
		ind, pdb_code, pdbbind_dir, featurizer = x[0], x[1], x[2], x[3]
		print("Processing complex %d, %s" % (ind, str(pdb_code)))
		pdb_subdir = os.path.join(pdbbind_dir, pdb_code)
		try:
		computed_feature = compute_pdbbind_features(featurizer, pdb_subdir,
		pdb_code)
		except MoleculeLoadException as e:
		logging.warning("Unable to compute features for %s" % x)
		return None
		except Exception as e:
		logging.warning("Unable to compute features for %s" % x)
		return None
		return ind, computed_feature

		def compute_pdbbind_features(grid_featurizer, pdb_subdir, pdb_code):
		"""Compute features for a given complex"""
		protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
		ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)
		features = grid_featurizer.featurize_complexes([ligand_file],
		[protein_file])
		features = np.squeeze(features)
		return features


		def load_pdbbind_grid(split="index", featurizer="grid", subset="full"):
		"""Load PDBBind datasets. Does not do train/test split"""
		dataset, tasks = featurize_pdbbind(feat=featurizer, subset=subset)
		@@ -168,3 +162,6 @@ def load_pdbbind_grid(split="index", featurizer="grid", subset="full"):
		test = transformer.transform(test)

		return tasks, (train, valid, test), transformers

		if __name__ == "__main__":
		load_pdbbind_grid()

Admin message