Merge pull request #1840 from deepchem/nnscore (3a5addb1) · Commits · 钟慕尧 / deepchem

deepchem/dock/pose_generation.py

+10 −16

Original line number	Diff line number	Diff line
		@@ -12,7 +12,7 @@ import numpy as np
		import os
		import tempfile
		from subprocess import call
		from deepchem.feat import hydrogenate_and_compute_partial_charges
		from deepchem.utils.rdkit_util import add_hydrogens_to_mol
		from deepchem.dock.binding_pocket import RFConvexHullPocketFinder
		from deepchem.utils import rdkit_util

		@@ -91,14 +91,9 @@ class VinaPoseGenerator(PoseGenerator):

		# Prepare receptor
		receptor_name = os.path.basename(protein_file).split(".")[0]
		protein_hyd = os.path.join(out_dir, "%s.pdb" % receptor_name)
		protein_hyd = os.path.join(out_dir, "%s_hyd.pdb" % receptor_name)
		protein_pdbqt = os.path.join(out_dir, "%s.pdbqt" % receptor_name)
		hydrogenate_and_compute_partial_charges(
		protein_file,
		"pdb",
		hyd_output=protein_hyd,
		pdbqt_output=protein_pdbqt,
		protein=True)

		# Get protein centroid and range
		# TODO(rbharath): Need to add some way to identify binding pocket, or this is
		# going to be extremely slow!
		@@ -107,7 +102,10 @@ class VinaPoseGenerator(PoseGenerator):
		else:
		if not self.detect_pockets:
		receptor_mol = rdkit_util.load_molecule(
		protein_hyd, calc_charges=False, add_hydrogens=False)
		protein_file, calc_charges=True, add_hydrogens=True)
		rdkit_util.write_molecule(receptor_mol[1], protein_hyd, is_protein=True)
		rdkit_util.write_molecule(
		receptor_mol[1], protein_pdbqt, is_protein=True)
		protein_centroid = mol_xyz_util.get_molecule_centroid(receptor_mol[0])
		protein_range = mol_xyz_util.get_molecule_range(receptor_mol[0])
		box_dims = protein_range + 5.0
		@@ -129,16 +127,12 @@ class VinaPoseGenerator(PoseGenerator):

		# Prepare receptor
		ligand_name = os.path.basename(ligand_file).split(".")[0]
		ligand_hyd = os.path.join(out_dir, "%s.pdb" % ligand_name)
		ligand_pdbqt = os.path.join(out_dir, "%s.pdbqt" % ligand_name)

		# TODO(rbharath): Generalize this so can support mol2 files as well.
		hydrogenate_and_compute_partial_charges(
		ligand_file,
		"sdf",
		hyd_output=ligand_hyd,
		pdbqt_output=ligand_pdbqt,
		protein=False)
		ligand_mol = rdkit_util.load_molecule(
		ligand_file, calc_charges=True, add_hydrogens=True)
		rdkit_util.write_molecule(ligand_mol[1], ligand_pdbqt)
		# Write Vina conf file
		conf_file = os.path.join(out_dir, "conf.txt")
		write_conf(

deepchem/feat/init.py

+0 −1

Original line number	Diff line number	Diff line
		@@ -16,7 +16,6 @@ from deepchem.feat.coulomb_matrices import CoulombMatrix
		from deepchem.feat.coulomb_matrices import CoulombMatrixEig
		from deepchem.feat.coulomb_matrices import BPSymmetryFunctionInput
		from deepchem.feat.rdkit_grid_featurizer import RdkitGridFeaturizer
		from deepchem.feat.nnscore_utils import hydrogenate_and_compute_partial_charges
		from deepchem.feat.binding_pocket_features import BindingPocketFeaturizer
		from deepchem.feat.one_hot import OneHotFeaturizer
		from deepchem.feat.raw_featurizer import RawFeaturizer

deepchem/feat/nnscore_utils.py

deleted100644 → 0

+0 −535

Original line number	Diff line number	Diff line
		"""
		Helper Classes and Functions for docking fingerprint computation.
		"""
		__author__ = "Bharath Ramsundar and Jacob Durrant"
		__license__ = "GNU General Public License"

		import logging
		import math
		import os
		import subprocess
		import numpy as np
		import deepchem.utils.rdkit_util as rdkit_util


		def force_partial_charge_computation(mol):
		"""Force computation of partial charges for molecule.

		Parameters
		----------
		mol: Rdkit Mol
		Molecule on which we compute partial charges.
		"""
		rdkit_util.compute_charges(mol)


		def pdbqt_to_pdb(input_file, output_directory):
		"""Convert pdbqt file to pdb file.

		Parameters
		----------
		input_file: String
		Path to input file.
		output_directory: String
		Path to desired output directory.
		"""
		logging.info(input_file, output_directory)
		raise ValueError("Not yet implemented")


		def hydrogenate_and_compute_partial_charges(input_file,
		input_format,
		hyd_output=None,
		pdbqt_output=None,
		protein=True,
		verbose=True):
		"""Outputs a hydrogenated pdb and a pdbqt with partial charges.

		Takes an input file in specified format. Generates two outputs:

		-) A pdb file that contains a hydrogenated (at pH 7.4) version of
		original compound.
		-) A pdbqt file that has computed Gasteiger partial charges. This pdbqt
		file is build from the hydrogenated pdb.

		TODO(rbharath): Can do a bit of refactoring between this function and
		pdbqt_to_pdb.

		Parameters
		----------
		input_file: String
		Path to input file.
		input_format: String
		Name of input format.
		"""
		mol = rdkit_util.load_molecule(
		input_file, add_hydrogens=True, calc_charges=True)[1]
		if verbose:
		logging.info("Create pdb with hydrogens added")
		rdkit_util.write_molecule(mol, str(hyd_output), is_protein=protein)
		if verbose:
		logging.info("Create a pdbqt file from the hydrogenated pdb above.")
		rdkit_util.write_molecule(mol, str(pdbqt_output), is_protein=protein)

		if protein:
		logging.info("Removing ROOT/ENDROOT/TORSDOF")
		with open(pdbqt_output) as f:
		pdbqt_lines = f.readlines()
		filtered_lines = []
		for line in pdbqt_lines:

		filtered_lines.append(line)
		with open(pdbqt_output, "w") as f:
		f.writelines(filtered_lines)


		class AromaticRing(object):
		"""Holds information about an aromatic ring."""

		def __init__(self, center, indices, plane_coeff, radius):
		"""
		Initializes an aromatic.

		Parameters
		----------
		center: float
		Center of the ring.
		indices: list
		List of the atom indices for ring atoms.
		plane_coeff: list
		A list of elements [a, b, c, d] that define a plane by equation
		a x + b y + c z = d.
		radius: float
		Ring radius from center.
		"""
		self.center = center
		self.indices = indices
		# ax + by + c*z = dI think that
		self.plane_coeff = plane_coeff
		self.radius = radius


		def average_point(points):
		"""Returns the point with averaged coordinates of arguments.

		Parameters
		----------
		points: list
		List of point objects.
		Returns
		-------
		pavg: Point object
		Has coordinates the arithmetic average of those of p1 and p2.
		"""
		coords = np.array([0, 0, 0])
		for point in points:
		coords += point.as_array().astype(coords.dtype)
		if len(points) > 0:
		return Point(coords=coords / len(points))
		else:
		return Point(coords=coords)


		class Point(object):
		"""
		Simple implementation for a point in 3-space.
		"""

		def __init__(self, x=None, y=None, z=None, coords=None):
		"""
		Inputs can be specified either by explicitly providing x, y, z coords
		or by providing a numpy array of length 3.

		Parameters
		----------
		x: float
		X-coord.
		y: float
		Y-coord.
		z: float
		Z-coord.
		coords: np.ndarray
		Should be of length 3 in format np.array([x, y, z])
		Raises
		------
		ValueError: If no arguments are provided.
		"""
		if x and y and z:
		#self.x, self.y, self.z = x, y, z
		self.coords = np.array([x, y, z])
		elif coords is not None: # Implicit eval doesn't work on numpy arrays.
		#self.x, self.y, self.z = coords[0], coords[1], coords[2]
		self.coords = coords
		else:
		raise ValueError("Must specify coordinates for Point!")

		# TODO(bramsundar): Should this be __copy__?
		def copy_of(self):
		"""Return a copy of this point."""
		return Point(coords=np.copy(self.coords))

		def dist_to(self, point):
		"""Distance (in 2-norm) from this point to another."""
		return np.linalg.norm(self.coords - point.coords)

		def magnitude(self):
		"""Magnitude of this point (in 2-norm)."""
		return np.linalg.norm(self.coords)
		#return self.dist_to(Point(coords=np.array([0, 0, 0])))

		def as_array(self):
		"""Return the coordinates of this point as array."""
		#return np.array([self.x, self.y, self.z])
		return self.coords


		class Atom(object):
		"""
		Implements a container class for atoms. This class contains useful
		annotations about the atom.
		"""

		def __init__(self,
		atomname="",
		residue="",
		coordinates=Point(coords=np.array([99999, 99999, 99999])),
		element="",
		pdb_index="",
		line="",
		atomtype="",
		indices_of_atoms_connecting=None,
		charge=0,
		resid=0,
		chain="",
		structure="",
		comment=""):
		"""
		Initializes an atom.

		Assumes that atom is loaded from a PDB file.

		Parameters
		----------
		atomname: string
		Name of atom. Note that atomname is not the same as residue since
		atomnames often have extra annotations (e.g., CG, NZ, etc).
		residue: string:
		Name of protein residue this atom belongs to.
		element: string
		Name of atom's element.
		coordinate: point
		A point object (x, y, z are in Angstroms).
		pdb_index: string
		Index of the atom in source PDB file.
		line: string
		The line in the PDB file which specifies this atom.
		atomtype: string
		Element of atom. This differs from atomname which typically has extra
		annotations (e.g. CA, OA, HD, etc)
		IndicesOfAtomConnecting: list
		The indices (in a PDB object) of all atoms bonded to this one.
		charge: float
		Associated electrostatic charge.
		resid: int
		The residue number in the receptor (listing the protein as a chain from
		N-Terminus to C-Terminus). Assumes this is a protein atom.
		chain: string
		Chain identifier for molecule. See PDB spec.
		structure: string
		One of ALPHA, BETA, or OTHER for the type of protein secondary
		structure this atom resides in (assuming this is a receptor atom).
		comment: string
		Either LIGAND or RECEPTOR depending on whether this is a ligand or
		receptor atom.
		"""
		self.atomname = atomname
		self.residue = residue
		self.coordinates = coordinates
		self.element = element
		self.pdb_index = pdb_index
		self.line = line
		self.atomtype = atomtype
		if indices_of_atoms_connecting is not None:
		self.indices_of_atoms_connecting = indices_of_atoms_connecting
		else:
		self.indices_of_atoms_connecting = []
		self.charge = charge
		self.resid = resid
		self.chain = chain
		self.structure = structure
		self.comment = comment

		def copy_of(self):
		"""Make a copy of this atom."""
		theatom = Atom()
		theatom.atomname = self.atomname
		theatom.residue = self.residue
		theatom.coordinates = self.coordinates.copy_of()
		theatom.element = self.element
		theatom.pdb_index = self.pdb_index
		theatom.line = self.line
		theatom.atomtype = self.atomtype
		theatom.indices_of_atoms_connecting = self.indices_of_atoms_connecting[:]
		theatom.charge = self.charge
		theatom.resid = self.resid
		theatom.chain = self.chain
		theatom.structure = self.structure
		theatom.comment = self.comment

		return theatom

		def create_pdb_line(self, index):
		"""
		Generates appropriate ATOM line for pdb file.

		Parameters
		----------
		index: int
		Index in associated PDB file.
		"""
		output = "ATOM "
		output = (
		output + str(index).rjust(6) + self.atomname.rjust(5) +
		self.residue.rjust(4) + self.chain.rjust(2) + str(self.resid).rjust(4))
		coords = self.coordinates.as_array() # [x, y, z]
		output = output + ("%.3f" % coords[0]).rjust(12)
		output = output + ("%.3f" % coords[1]).rjust(8)
		output = output + ("%.3f" % coords[2]).rjust(8)
		output = output + self.element.rjust(24)
		return output

		def number_of_neighbors(self):
		"""Reports number of neighboring atoms."""
		return len(self.indices_of_atoms_connecting)

		def add_neighbor_atom_indices(self, indices):
		"""
		Adds atoms with provided PDB indices as neighbors.

		Parameters
		----------
		index: list
		List of indices of neighbors in PDB object.
		"""
		for index in indices:
		if index not in self.indices_of_atoms_connecting:
		self.indices_of_atoms_connecting.append(index)

		def side_chain_or_backbone(self):
		"""Determine whether receptor atom belongs to residue sidechain or backbone.
		"""
		# TODO(rbharath): Should this be an atom function?
		if (self.atomname.strip() == "CA" or self.atomname.strip() == "C" or
		self.atomname.strip() == "O" or self.atomname.strip() == "N"):
		return "BACKBONE"
		else:
		return "SIDECHAIN"

		def read_atom_pdb_line(self, line):
		"""
		TODO(rbharath): This method probably belongs in the PDB class, and not
		in the Atom class.

		Reads an ATOM or HETATM line from PDB and instantiates fields.

		Atoms in PDBs are represented by ATOM or HETATM statements. ATOM and
		HETATM statements follow the following record format:

		(see ftp://ftp.wwpdb.org/pub/pdb/doc/format_descriptions/Format_v33_Letter.pdf)

		COLUMNS DATA TYPE FIELD DEFINITION
		-------------------------------------------------------------------------------------
		1 - 6 Record name "ATOM "/"HETATM"
		7 - 11 Integer serial Atom serial number.
		13 - 16 Atom name Atom name.
		17 Character altLoc Alternate location indicator.
		18 - 20 Residue name resName Residue name.
		22 Character chainID Chain identifier.
		23 - 26 Integer resSeq Residue sequence number.
		27 AChar iCode Code for insertion of residues.
		31 - 38 Real(8.3) x Orthogonal coordinates for X in Angstroms.
		39 - 46 Real(8.3) y Orthogonal coordinates for Y in Angstroms.
		47 - 54 Real(8.3) z Orthogonal coordinates for Z in Angstroms.
		55 - 60 Real(6.2) occupancy Occupancy.
		61 - 66 Real(6.2) tempFactor Temperature factor.
		77 - 78 LString(2) element Element symbol, right-justified.
		79 - 80 LString(2) charge Charge on the atom.
		"""
		self.line = line
		self.atomname = line[11:16].strip()

		if len(self.atomname) == 1:
		self.atomname = self.atomname + " "
		elif len(self.atomname) == 2:
		self.atomname = self.atomname + " "
		elif len(self.atomname) == 3:
		# This line is necessary for babel to work, though many PDBs in
		# the PDB would have this line commented out
		self.atomname = self.atomname + " "

		self.coordinates = Point(
		coords=np.array(
		[float(line[30:38]),
		float(line[38:46]),
		float(line[46:54])]))

		# now atom type (for pdbqt)
		if line[77:79].strip():
		self.atomtype = line[77:79].strip().upper()
		elif self.atomname:
		# If atomtype is not specified, but atomname is, set atomtype to the
		# first letter of atomname. This heuristic suffices for proteins,
		# since no two-letter elements appear in standard amino acids.
		self.atomtype = self.atomname[:1]
		else:
		self.atomtype = ""

		if line[69:76].strip() != "":
		self.charge = float(line[69:76])
		else:
		self.charge = 0.0

		if self.element == "": # try to guess at element from name
		two_letters = self.atomname[0:2].strip().upper()
		valid_two_letters = [
		"BR", "CL", "BI", "AS", "AG", "LI", "HG", "MG", "MN", "RH", "ZN", "FE"
		]
		if two_letters in valid_two_letters:
		self.element = two_letters
		else: #So, just assume it's the first letter.
		# Any number needs to be removed from the element name
		self.element = self.atomname
		self.element = self.element.replace('0', '')
		self.element = self.element.replace('1', '')
		self.element = self.element.replace('2', '')
		self.element = self.element.replace('3', '')
		self.element = self.element.replace('4', '')
		self.element = self.element.replace('5', '')
		self.element = self.element.replace('6', '')
		self.element = self.element.replace('7', '')
		self.element = self.element.replace('8', '')
		self.element = self.element.replace('9', '')
		self.element = self.element.replace('@', '')

		self.element = self.element[0:1].strip().upper()

		self.pdb_index = line[6:12].strip()
		self.residue = line[16:20]
		# this only uses the rightmost three characters, essentially
		# removing unique rotamer identification
		self.residue = " " + self.residue[-3:]

		if line[23:26].strip() != "":
		self.resid = int(line[23:26])
		else:
		self.resid = 1

		self.chain = line[21:22]
		if self.residue.strip() == "":
		self.residue = " MOL"


		class Charged(object):
		"""
		A class that represeents a charged atom.
		"""

		def __init__(self, coordinates, indices, positive):
		"""
		Parameters
		----------
		coordinates: point
		Coordinates of atom.
		indices: list
		Contains boolean true or false entries for self and neighbors to
		specify if positive or negative charge
		positive: bool
		Whether this atom is positive or negative.
		"""
		self.coordinates = coordinates
		self.indices = indices
		self.positive = positive


		def vector_subtraction(point1, point2): # point1 - point2
		"""Subtracts the coordinates of the provided points."""
		return Point(coords=point1.as_array() - point2.as_array())


		def cross_product(point1, point2): # never tested
		"""Calculates the cross-product of provided points."""
		return Point(coords=np.cross(point1.as_array(), point2.as_array()))


		def vector_scalar_multiply(point, scalar):
		"""Multiplies the provided point by scalar."""
		return Point(coords=scalar * point.as_array())


		def dot_product(point1, point2):
		"""Dot product of points."""
		return np.dot(point1.as_array(), point2.as_array())


		def dihedral(point1, point2, point3, point4): # never tested
		"""Compute dihedral angle between 4 points.

		TODO(rbharath): Write a nontrivial test for this.
		"""

		b1 = vector_subtraction(point2, point1)
		b2 = vector_subtraction(point3, point2)
		b3 = vector_subtraction(point4, point3)

		b2Xb3 = cross_product(b2, b3)
		b1Xb2 = cross_product(b1, b2)

		b1XMagb2 = vector_scalar_multiply(b1, b2.magnitude())
		radians = math.atan2(dot_product(b1XMagb2, b2Xb3), dot_product(b1Xb2, b2Xb3))
		return radians


		def angle_between_three_points(point1, point2, point3):
		"""Computes the angle (in radians) between the three provided points."""
		return angle_between_points(
		vector_subtraction(point1, point2), vector_subtraction(point3, point2))


		def angle_between_points(point1, point2):
		"""Computes the angle (in radians) between two points."""
		return math.acos(
		dot_product(point1, point2) / (point1.magnitude() * point2.magnitude()))


		def normalized_vector(point):
		"""Normalize provided point."""
		return Point(coords=point.as_array() / np.linalg.norm(point.as_array()))


		def distance(point1, point2):
		"""Computes distance between two points."""
		return point1.dist_to(point2)


		def project_point_onto_plane(point, plane_coefficients):
		"""Finds nearest point on specified plane to given point.

		Parameters
		----------
		point: Point
		Given point
		plane_coefficients: list
		[a, b, c, d] where place equation is ax + by + cz = d
		"""
		# The normal vector to plane is n = [a, b, c]
		offset = plane_coefficients[3]
		normal = np.array(plane_coefficients[:3])
		# We first shift by basepoint (a point on given plane) to make math
		# simpler. basepoint is given by d/\|\|n\|\|^2 * n
		basepoint = (offset / np.linalg.norm(normal)*2) normal
		diff = point.as_array() - basepoint
		# The perpendicular component of diff to plane is
		# (n^T diff / \|\|n\|\|^2) * n
		perp = (np.dot(normal, diff) / np.linalg.norm(normal)*2) normal
		closest = basepoint + (diff - perp)
		return Point(coords=np.array(closest))

Admin message