Merge pull request #231 from rbharath/complex_support (354bdb65) · Commits · 钟慕尧 / deepchem

deepchem/datasets/pdbbind_datasets.py

+61 −4

Original line number	Diff line number	Diff line
		@@ -17,6 +17,7 @@ from deepchem.featurizers.fingerprints import CircularFingerprint
		from deepchem.transformers import BalancingTransformer
		from deepchem.featurizers.nnscore import NNScoreComplexFeaturizer
		from deepchem.featurizers.grid_featurizer import GridFeaturizer
		from deepchem.featurizers.atomic_coordinates import NeighborListComplexAtomicCoordinates

		def load_pdbbind_labels(labels_file):
		"""Loads pdbbind labels as dataframe"""
		@@ -44,9 +45,7 @@ def compute_pdbbind_grid_feature(compound_featurizers, complex_featurizers,
		for complex_featurizer in complex_featurizers:
		features = complex_featurizer.featurize_complexes(
		[ligand_file], [protein_file])
		################################################ DEBUG
		all_features.append(np.squeeze(features))
		################################################ DEBUG

		for compound_featurizer in compound_featurizers:
		features = np.squeeze(compound_featurizer.featurize([rdkit_mol]))
		@@ -55,6 +54,66 @@ def compute_pdbbind_grid_feature(compound_featurizers, complex_featurizers,
		features = np.concatenate(all_features)
		return features

		def compute_pdbbind_coordinate_features(
		complex_featurizer, pdb_subdir, pdb_code):
		"""Compute features for a given complex"""
		protein_file = os.path.join(pdb_subdir, "%s_protein.pdb" % pdb_code)
		ligand_file = os.path.join(pdb_subdir, "%s_ligand.sdf" % pdb_code)

		feature = complex_featurizer.featurize_complexes(
		[ligand_file], [protein_file])
		return feature

		def load_core_pdbbind_coordinates(pdbbind_dir, base_dir, reload=True):
		"""Load PDBBind datasets. Does not do train/test split"""
		# Set some global variables up top
		reload = True
		verbosity = "high"
		model = "logistic"
		regen = False
		neighbor_cutoff = 4
		max_num_neighbors = 10

		# Create some directories for analysis
		# The base_dir holds the results of all analysis
		if not reload:
		if os.path.exists(base_dir):
		shutil.rmtree(base_dir)
		if not os.path.exists(base_dir):
		os.makedirs(base_dir)
		current_dir = os.path.dirname(os.path.realpath(__file__))
		#Make directories to store the raw and featurized datasets.
		data_dir = os.path.join(base_dir, "dataset")

		# Load PDBBind dataset
		labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
		pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
		tasks = ["-logKd/Ki"]
		print("About to load contents.")
		contents_df = load_pdbbind_labels(labels_file)
		ids = contents_df["PDB code"].values
		y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

		# Define featurizers
		featurizer = NeighborListComplexAtomicCoordinates(
		max_num_neighbors, neighbor_cutoff)

		# Featurize Dataset
		features = []
		for ind, pdb_code in enumerate(ids):
		print("Processing %s" % str(pdb_code))
		pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
		computed_feature = compute_pdbbind_coordinate_features(
		featurizer, pdb_subdir, pdb_code)
		features.append(computed_feature)
		X = np.array(features, dtype-object)
		w = np.ones_like(y)

		dataset = Dataset.from_numpy(data_dir, X, y, w, ids)
		transformers = []

		return tasks, dataset, transformers

		def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
		"""Load PDBBind datasets. Does not do train/test split"""
		# Set some global variables up top
		@@ -113,9 +172,7 @@ def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
		continue
		y_inds.append(ind)
		features.append(computed_feature)
		############################################################# DEBUG
		y = y[y_inds]
		############################################################# DEBUG
		X = np.vstack(features)
		w = np.ones_like(y)

deepchem/datasets/tox21_datasets.py

+0 −13

Original line number	Diff line number	Diff line
		@@ -8,24 +8,11 @@ from __future__ import unicode_literals
		import os
		import numpy as np
		import shutil
		from sklearn.linear_model import LogisticRegression
		from sklearn.ensemble import RandomForestClassifier
		from deepchem.utils.save import load_from_disk
		from deepchem.datasets import Dataset
		from deepchem.featurizers.featurize import DataLoader
		from deepchem.featurizers.fingerprints import CircularFingerprint
		from deepchem.splits import ScaffoldSplitter
		from deepchem.splits import RandomSplitter
		from deepchem.datasets import Dataset
		from deepchem.transformers import BalancingTransformer
		from deepchem.hyperparameters import HyperparamOpt
		from deepchem.models.multitask import SingletaskToMultitask
		from deepchem import metrics
		from deepchem.metrics import Metric
		from deepchem.metrics import to_one_hot
		from deepchem.models.sklearn_models import SklearnModel
		from deepchem.utils.evaluate import relative_difference
		from deepchem.utils.evaluate import Evaluator

		def load_tox21(base_dir, reload=True):
		"""Load Tox21 datasets. Does not do train/test split"""

deepchem/featurizers/atomic_coordinates.py

+92 −43

Original line number	Diff line number	Diff line
		@@ -11,6 +11,9 @@ __license__ = "LGPL v2.1+"

		import numpy as np
		from deepchem.featurizers import Featurizer
		from deepchem.featurizers import ComplexFeaturizer
		from deepchem.featurizers.grid_featurizer import load_molecule
		from deepchem.featurizers.grid_featurizer import merge_molecules

		class AtomicCoordinates(Featurizer):
		"""
		@@ -45,6 +48,52 @@ class AtomicCoordinates(Featurizer):
		coords = [coords]
		return coords

		def compute_neighbor_list(coords, neighbor_cutoff, max_num_neighbors):
		"""Computes a neighbor list from atom coordinates."""
		N = coords.shape[0]
		x_bins, y_bins, z_bins = get_cells(coords, neighbor_cutoff)

		# Associate each atom with cell it belongs to. O(N)
		cell_to_atoms, atom_to_cell = put_atoms_in_cells(
		coords, x_bins, y_bins, z_bins)

		# Associate each cell with its neighbor cells. Assumes periodic boundary
		# conditions, so does wrapround. O(constant)
		N_x, N_y, N_z = len(x_bins), len(y_bins), len(z_bins)
		neighbor_cell_map = compute_neighbor_cell_map(N_x, N_y, N_z)

		# For each atom, loop through all atoms in its cell and neighboring cells.
		# Accept as neighbors only those within threshold. This computation should be
		# O(Nm), where m is the number of atoms within a set of neighboring-cells.
		neighbor_list = {}
		for atom in range(N):
		cell = atom_to_cell[atom]
		neighbor_cells = neighbor_cell_map[cell]
		# For smaller systems especially, the periodic boundary conditions can
		# result in neighboring cells being seen multiple times. Use a set() to
		# make sure duplicate neighbors are ignored. Convert back to list before
		# returning.
		neighbor_list[atom] = set()
		for neighbor_cell in neighbor_cells:
		atoms_in_cell = cell_to_atoms[neighbor_cell]
		for neighbor_atom in atoms_in_cell:
		if neighbor_atom == atom:
		continue
		# TODO(rbharath): How does distance need to be modified here to
		# account for periodic boundary conditions?
		dist = np.linalg.norm(coords[atom] - coords[neighbor_atom])
		if dist < neighbor_cutoff:
		neighbor_list[atom].add((neighbor_atom, dist))

		# Sort neighbors by distance
		closest_neighbors = sorted(
		list(neighbor_list[atom]), key=lambda elt: elt[1])
		closest_neighbors = [nbr for (nbr, dist) in closest_neighbors]
		# Pick up to max_num_neighbors
		closest_neighbors = closest_neighbors[:max_num_neighbors]
		neighbor_list[atom] = closest_neighbors
		return neighbor_list

		def get_cells(coords, neighbor_cutoff):
		"""Computes cells given molecular coordinates."""
		x_max, x_min = np.amax(coords[:, 0]), np.amin(coords[:, 0])
		@@ -187,53 +236,53 @@ class NeighborListAtomicCoordinates(Featurizer):

		Parameters
		----------
		mol: rdkit Mol
		To be featurized.
		"""
		N = mol.GetNumAtoms()
		# TODO(rbharath): Should this return a list?
		bohr_coords = self.coordinates_featurizer._featurize(mol)[0]
		coords = get_coords(mol)
		neighbor_list = compute_neighbor_list(
		coords, self.neighbor_cutoff, self.max_num_neighbors)

		x_bins, y_bins, z_bins = get_cells(coords, self.neighbor_cutoff)
		return (bohr_coords, neighbor_list)

		# Associate each atom with cell it belongs to. O(N)
		cell_to_atoms, atom_to_cell = put_atoms_in_cells(
		coords, x_bins, y_bins, z_bins)
		class NeighborListComplexAtomicCoordinates(ComplexFeaturizer):
		"""
		Adjacency list of neighbors for protein-ligand complexes in 3-space.

		# Associate each cell with its neighbor cells. Assumes periodic boundary
		# conditions, so does wrapround. O(constant)
		N_x, N_y, N_z = len(x_bins), len(y_bins), len(z_bins)
		neighbor_cell_map = compute_neighbor_cell_map(N_x, N_y, N_z)
		Neighbors dtermined by user-dfined distance cutoff.
		"""
		def __init__(self, max_num_neighbors=None, neighbor_cutoff=4):
		if neighbor_cutoff <= 0:
		raise ValueError("neighbor_cutoff must be positive value.")
		if max_num_neighbors is not None:
		if not isinstance(max_num_neighbors, int) or max_num_neighbors <= 0:
		raise ValueError("max_num_neighbors must be positive integer.")
		self.max_num_neighbors = max_num_neighbors
		self.neighbor_cutoff = neighbor_cutoff
		# Type of data created by this featurizer
		self.dtype = object
		self.coordinates_featurizer = AtomicCoordinates()

		# For each atom, loop through all atoms in its cell and neighboring cells.
		# Accept as neighbors only those within threshold. This computation should be
		# O(Nm), where m is the number of atoms within a set of neighboring-cells.
		neighbor_list = {}
		for atom in range(N):
		cell = atom_to_cell[atom]
		neighbor_cells = neighbor_cell_map[cell]
		# For smaller systems especially, the periodic boundary conditions can
		# result in neighboring cells being seen multiple times. Use a set() to
		# make sure duplicate neighbors are ignored. Convert back to list before
		# returning.
		neighbor_list[atom] = set()
		for neighbor_cell in neighbor_cells:
		atoms_in_cell = cell_to_atoms[neighbor_cell]
		for neighbor_atom in atoms_in_cell:
		if neighbor_atom == atom:
		continue
		# TODO(rbharath): How does distance need to be modified here to
		# account for periodic boundary conditions?
		dist = np.linalg.norm(coords[atom] - coords[neighbor_atom])
		if dist < self.neighbor_cutoff:
		neighbor_list[atom].add((neighbor_atom, dist))
		def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
		"""
		Compute neighbor list for complex.

		# Sort neighbors by distance
		closest_neighbors = sorted(
		list(neighbor_list[atom]), key=lambda elt: elt[1])
		closest_neighbors = [nbr for (nbr, dist) in closest_neighbors]
		# Pick up to max_num_neighbors
		closest_neighbors = closest_neighbors[:self.max_num_neighbors]
		neighbor_list[atom] = closest_neighbors
		Parameters
		----------
		mol_pdb: list
		Should be a list of lines of the PDB file.
		complex_pdb: list
		Should be a list of lines of the PDB file.
		"""
		mol_coords, ob_mol = load_molecule(mol_pdb_file)
		protein_coords, protein_mol = load_molecule(protein_pdb_file)
		system_coords, system_mol = merge_molecules(
		mol_coords, ob_mol, protein_coords, protein_mol)

		system_neighbor_list = compute_neighbor_list(
		system_coords, self.neighbor_cutoff, self.max_num_neighbors)

		return (bohr_coords, neighbor_list)
		return (system_coords, system_neighbor_list)

deepchem/featurizers/grid_featurizer.py

+70 −60

Original line number	Diff line number	Diff line
		@@ -28,6 +28,74 @@ import time
		"""
		http://stackoverflow.com/questions/38987/how-can-i-merge-two-python-dictionaries-in-a-single-expression
		"""
		def get_xyz_from_ob(ob_mol):
		"""
		returns an m x 3 np array of 3d coords
		of given openbabel molecule
		"""

		xyz = np.zeros((ob_mol.NumAtoms(), 3))
		for i, atom in enumerate(ob.OBMolAtomIter(ob_mol)):
		xyz[i, 0] = atom.x()
		xyz[i, 1] = atom.y()
		xyz[i, 2] = atom.z()
		return(xyz)


		def load_molecule(molecule_file, remove_hydrogens=True,
		calc_charges=False):
		"""Converts molecule file to (xyz-coords, obmol object)

		Given molecule_file, returns a tuple of xyz coords of molecule
		and an openbabel object representing that molecule
		"""

		if ".mol2" in molecule_file:
		obConversion = ob.OBConversion()
		obConversion.SetInAndOutFormats(str("mol2"), str("mol2"))
		ob_mol = ob.OBMol()
		obConversion.ReadFile(ob_mol, str(molecule_file))
		elif ".sdf" in molecule_file:
		obConversion = ob.OBConversion()
		obConversion.SetInAndOutFormats(str("sdf"), str("sdf"))
		ob_mol = ob.OBMol()
		obConversion.ReadFile(ob_mol, str(molecule_file))
		elif ".pdb" in molecule_file:
		obConversion = ob.OBConversion()
		obConversion.SetInAndOutFormats(str("pdb"), str("pdb"))
		ob_mol = ob.OBMol()
		obConversion.ReadFile(ob_mol, str(molecule_file))
		else:
		raise ValueError("Unrecognized file type")

		if calc_charges:
		gasteiger = ob.OBChargeModel.FindType(str("gasteiger"))
		gasteiger.ComputeCharges(ob_mol)
		ob_mol.UnsetImplicitValencePerceived()
		ob_mol.CorrectForPH(7.4)
		ob_mol.AddHydrogens()
		else:
		ob_mol.AddHydrogens()

		xyz = get_xyz_from_ob(ob_mol)

		return xyz, ob_mol

		def merge_molecules(protein_xyz, protein, ligand_xyz, ligand):
		"""Merges coordinates of protein and ligand.

		Takes as input protein and ligand objects of class PDB and adds ligand atoms
		to the protein, and returns the new instance of class PDB called system that
		contains both sets of atoms.
		"""

		system_xyz = np.array(np.vstack(np.vstack((protein_xyz, ligand_xyz))))
		system_ob = ob.OBMol(protein)
		system_ob += ligand

		return system_xyz, system_ob


		def _merge_two_dicts(x, y):
		"""Given two dicts, merge them into a new dict as a shallow copy."""
		z = x.copy()
		@@ -749,20 +817,6 @@ def _convert_atom_pair_to_voxel(
		atom_index_pair[1], box_width, voxel_width)[0])
		return(indices_list)

		def _merge_molecules(protein_xyz, protein, ligand_xyz, ligand):
		"""Merges coordinates of protein and ligand.

		Takes as input protein and ligand objects of class PDB and adds ligand atoms
		to the protein, and returns the new instance of class PDB called system that
		contains both sets of atoms.
		"""

		system_xyz = np.array(np.vstack(np.vstack((protein_xyz, ligand_xyz))))
		system_ob = ob.OBMol(protein_ob)
		system_ob += ligand_ob

		return system_xyz, system_ob

		def _compute_charge_dictionary(molecule):
		"""Computes partial charges for each atom."""
		charge_dictionary = {}
		@@ -900,7 +954,7 @@ class GridFeaturizer(ComplexFeaturizer):
		"/")[len(str(protein_pdb).split("/")) - 2]

		if not self.ligand_only:
		protein_xyz, protein_ob = self._load_molecule(
		protein_xyz, protein_ob = load_molecule(
		protein_pdb, calc_charges=False)
		############################################################## TIMING
		time2 = time.time()
		@@ -910,7 +964,7 @@ class GridFeaturizer(ComplexFeaturizer):
		############################################################## TIMING
		time1 = time.time()
		############################################################## TIMING
		ligand_xyz, ligand_ob = self._load_molecule(
		ligand_xyz, ligand_ob = load_molecule(
		ligand_pdb, calc_charges=False)
		############################################################## TIMING
		time2 = time.time()
		@@ -1243,50 +1297,6 @@ class GridFeaturizer(ComplexFeaturizer):

		return feature_vector

		def _get_xyz_from_ob(self, ob_mol):
		"""
		returns an m x 3 np array of 3d coords
		of given openbabel molecule
		"""

		xyz = np.zeros((ob_mol.NumAtoms(), 3))
		for i, atom in enumerate(ob.OBMolAtomIter(ob_mol)):
		xyz[i, 0] = atom.x()
		xyz[i, 1] = atom.y()
		xyz[i, 2] = atom.z()
		return(xyz)

		def _load_molecule(self, molecule_file, remove_hydrogens=True,
		calc_charges=False):
		"""
		given molecule_file, returns a tuple of xyz coords of molecule
		and an openbabel object representing that molecule
		"""

		if ".mol2" in molecule_file:
		obConversion = ob.OBConversion()
		obConversion.SetInAndOutFormats(str("mol2"), str("mol2"))
		ob_mol = ob.OBMol()
		obConversion.ReadFile(ob_mol, molecule_file)
		else:
		obConversion = ob.OBConversion()
		obConversion.SetInAndOutFormats(str("pdb"), str("pdb"))
		ob_mol = ob.OBMol()
		obConversion.ReadFile(ob_mol, str(molecule_file))

		if calc_charges:
		gasteiger = ob.OBChargeModel.FindType(str("gasteiger"))
		gasteiger.ComputeCharges(ob_mol)
		ob_mol.UnsetImplicitValencePerceived()
		ob_mol.CorrectForPH(7.4)
		ob_mol.AddHydrogens()
		else:
		ob_mol.AddHydrogens()

		xyz = self._get_xyz_from_ob(ob_mol)

		return xyz, ob_mol

		def _generate_box(self, mol):
		"""Removes atoms outside box.

deepchem/featurizers/tests/test_atomic_coordinates.py

+18 −0

Original line number	Diff line number	Diff line
		"""
		Test atomic coordinates and neighbor lists.
		"""
		import os
		import numpy as np
		import unittest
		from rdkit import Chem
		@@ -11,6 +12,7 @@ from deepchem.featurizers.atomic_coordinates import put_atoms_in_cells
		from deepchem.featurizers.atomic_coordinates import compute_neighbor_cell_map
		from deepchem.featurizers.atomic_coordinates import AtomicCoordinates
		from deepchem.featurizers.atomic_coordinates import NeighborListAtomicCoordinates
		from deepchem.featurizers.atomic_coordinates import NeighborListComplexAtomicCoordinates

		class TestAtomicCoordinates(unittest.TestCase):
		"""
		@@ -228,3 +230,19 @@ class TestAtomicCoordinates(unittest.TestCase):
		assert nblist[i] == [closest_nbr]
		else:
		assert nblist[i] == []

		def test_complex_featurization_simple(self):
		"""Test Neighbor List computation on protein-ligand complex."""
		dir_path = os.path.dirname(os.path.realpath(__file__))
		ligand_file = os.path.join(dir_path, "data/3zso_ligand_hyd.pdb")
		protein_file = os.path.join(dir_path, "data/3zso_protein.pdb")
		max_num_neighbors = 4
		complex_featurizer = NeighborListComplexAtomicCoordinates(max_num_neighbors)

		system_coords, system_neighbor_list = complex_featurizer._featurize_complex(
		ligand_file, protein_file)

		N = system_coords.shape[0]
		assert len(system_neighbor_list.keys()) == N
		for atom in range(N):
		assert len(system_neighbor_list[atom]) <= max_num_neighbors

Admin message