Remove more files (5bd91993) · Commits · 钟慕尧 / deepchem

deepchem/feat/adjacency_fingerprints.py

deleted100755 → 0

+0 −168

Original line number	Diff line number	Diff line
		from collections import deque

		import sys
		import tensorflow as tf
		import pickle

		import os
		import fnmatch
		import numpy as np
		from scipy.spatial.distance import pdist, squareform
		import pandas as pd

		from deepchem.feat.base_classes import Featurizer
		from deepchem.feat.graph_features import atom_features
		from scipy.sparse import csr_matrix


		def get_atom_type(atom):
		elem = atom.GetAtomicNum()
		hyb = str(atom.GetHybridization).lower()
		if elem == 1:
		return (0)
		if elem == 4:
		return (1)
		if elem == 5:
		return (2)
		if elem == 6:
		if "sp2" in hyb:
		return (3)
		elif "sp3" in hyb:
		return (4)
		else:
		return (5)
		if elem == 7:
		if "sp2" in hyb:
		return (6)
		elif "sp3" in hyb:
		return (7)
		else:
		return (8)
		if elem == 8:
		if "sp2" in hyb:
		return (9)
		elif "sp3" in hyb:
		return (10)
		else:
		return (11)
		if elem == 9:
		return (12)
		if elem == 15:
		if "sp2" in hyb:
		return (13)
		elif "sp3" in hyb:
		return (14)
		else:
		return (15)
		if elem == 16:
		if "sp2" in hyb:
		return (16)
		elif "sp3" in hyb:
		return (17)
		else:
		return (18)
		if elem == 17:
		return (19)
		if elem == 35:
		return (20)
		if elem == 53:
		return (21)
		return (22)


		def get_atom_adj_matrices(mol,
		n_atom_types,
		max_n_atoms=200,
		max_valence=4,
		graph_conv_features=True,
		nxn=True):
		if not graph_conv_features:
		bond_matrix = np.zeros((max_n_atoms, 4 * max_valence)).astype(np.uint8)

		if nxn:
		adj_matrix = np.zeros((max_n_atoms, max_n_atoms)).astype(np.uint8)
		else:
		adj_matrix = np.zeros((max_n_atoms, max_valence)).astype(np.uint8)
		adj_matrix += (adj_matrix.shape[0] - 1)

		if not graph_conv_features:
		atom_matrix = np.zeros((max_n_atoms, n_atom_types + 3)).astype(np.uint8)
		atom_matrix[:, atom_matrix.shape[1] - 1] = 1

		atom_arrays = []
		for a_idx in range(0, mol.GetNumAtoms()):
		atom = mol.GetAtomWithIdx(a_idx)
		if graph_conv_features:
		atom_arrays.append(atom_features(atom))
		else:

		atom_type = get_atom_type(atom)
		atom_matrix[a_idx][-1] = 0
		atom_matrix[a_idx][atom_type] = 1

		for n_idx, neighbor in enumerate(atom.GetNeighbors()):
		if nxn:
		adj_matrix[a_idx][neighbor.GetIdx()] = 1
		adj_matrix[a_idx][a_idx] = 1
		else:
		adj_matrix[a_idx][n_idx] = neighbor.GetIdx()

		if not graph_conv_features:
		bond = mol.GetBondBetweenAtoms(a_idx, neighbor.GetIdx())
		bond_type = str(bond.GetBondType()).lower()
		if "single" in bond_type:
		bond_order = 0
		elif "double" in bond_type:
		bond_order = 1
		elif "triple" in bond_type:
		bond_order = 2
		elif "aromatic" in bond_type:
		bond_order = 3
		bond_matrix[a_idx][(4 * n_idx) + bond_order] = 1

		if graph_conv_features:
		n_feat = len(atom_arrays[0])
		atom_matrix = np.zeros((max_n_atoms, n_feat)).astype(np.uint8)
		for idx, atom_array in enumerate(atom_arrays):
		atom_matrix[idx, :] = atom_array
		else:
		atom_matrix = np.concatenate(
		[atom_matrix, bond_matrix], axis=1).astype(np.uint8)

		return (adj_matrix.astype(np.uint8), atom_matrix.astype(np.uint8))


		def featurize_mol(mol, n_atom_types, max_n_atoms, max_valence,
		num_atoms_feature):
		adj_matrix, atom_matrix = get_atom_adj_matrices(mol, n_atom_types,
		max_n_atoms, max_valence)
		if num_atoms_feature:
		return ((adj_matrix, atom_matrix, mol.GetNumAtoms()))
		return ((adj_matrix, atom_matrix))


		class AdjacencyFingerprint(Featurizer):

		def __init__(self,
		n_atom_types=23,
		max_n_atoms=200,
		add_hydrogens=False,
		max_valence=4,
		num_atoms_feature=False):
		self.n_atom_types = n_atom_types
		self.max_n_atoms = max_n_atoms
		self.add_hydrogens = add_hydrogens
		self.max_valence = max_valence
		self.num_atoms_feature = num_atoms_feature

		def featurize(self, rdkit_mols):
		featurized_mols = np.empty((len(rdkit_mols)), dtype=object)

		from rdkit import Chem
		for idx, mol in enumerate(rdkit_mols):
		if self.add_hydrogens:
		mol = Chem.AddHs(mol)
		featurized_mol = featurize_mol(mol, self.n_atom_types, self.max_n_atoms,
		self.max_valence, self.num_atoms_feature)
		featurized_mols[idx] = featurized_mol
		return (featurized_mols)

deepchem/feat/atomic_coordinates.py

deleted100644 → 0

+0 −305

Original line number	Diff line number	Diff line
		"""
		Atomic coordinate featurizer.
		"""
		__author__ = "Joseph Gomes and Bharath Ramsundar"
		__copyright__ = "Copyright 2016, Stanford University"
		__license__ = "MIT"

		import logging
		import numpy as np
		from deepchem.utils.save import log
		from deepchem.feat import Featurizer
		from deepchem.feat import ComplexFeaturizer
		from deepchem.utils import rdkit_util, pad_array
		from deepchem.utils.rdkit_util import MoleculeLoadException


		class AtomicCoordinates(Featurizer):
		"""
		Nx3 matrix of Cartesian coordinates [Angstrom]
		"""
		name = ['atomic_coordinates']

		def _featurize(self, mol):
		"""
		Calculate atomic coodinates.

		Parameters
		----------
		mol : RDKit Mol
		Molecule.
		"""

		N = mol.GetNumAtoms()
		coords = np.zeros((N, 3))

		# RDKit stores atomic coordinates in Angstrom. Atomic unit of length is the
		# bohr (1 bohr = 0.529177 Angstrom). Converting units makes gradient calculation
		# consistent with most QM software packages.
		coords_in_bohr = [
		mol.GetConformer(0).GetAtomPosition(i).__idiv__(0.52917721092)
		for i in range(N)
		]

		for atom in range(N):
		coords[atom, 0] = coords_in_bohr[atom].x
		coords[atom, 1] = coords_in_bohr[atom].y
		coords[atom, 2] = coords_in_bohr[atom].z

		coords = [coords]
		return coords


		def compute_neighbor_list(coords, neighbor_cutoff, max_num_neighbors,
		periodic_box_size):
		"""Computes a neighbor list from atom coordinates."""
		N = coords.shape[0]
		import mdtraj
		traj = mdtraj.Trajectory(coords.reshape((1, N, 3)), None)
		box_size = None
		if periodic_box_size is not None:
		box_size = np.array(periodic_box_size)
		traj.unitcell_vectors = np.array(
		[[[box_size[0], 0, 0], [0, box_size[1], 0], [0, 0, box_size[2]]]],
		dtype=np.float32)
		neighbors = mdtraj.geometry.compute_neighborlist(traj, neighbor_cutoff)
		neighbor_list = {}
		for i in range(N):
		if max_num_neighbors is not None and len(neighbors[i]) > max_num_neighbors:
		delta = coords[i] - coords.take(neighbors[i], axis=0)
		if box_size is not None:
		delta -= np.round(delta / box_size) * box_size
		dist = np.linalg.norm(delta, axis=1)
		sorted_neighbors = list(zip(dist, neighbors[i]))
		sorted_neighbors.sort()
		neighbor_list[i] = [
		sorted_neighbors[j][1] for j in range(max_num_neighbors)
		]
		else:
		neighbor_list[i] = list(neighbors[i])
		return neighbor_list


		def get_coords(mol):
		"""
		Gets coordinates in Angstrom for RDKit mol.
		"""
		N = mol.GetNumAtoms()
		coords = np.zeros((N, 3))

		coords_raw = [mol.GetConformer(0).GetAtomPosition(i) for i in range(N)]
		for atom in range(N):
		coords[atom, 0] = coords_raw[atom].x
		coords[atom, 1] = coords_raw[atom].y
		coords[atom, 2] = coords_raw[atom].z
		return coords


		class NeighborListAtomicCoordinates(Featurizer):
		"""
		Adjacency List of neighbors in 3-space

		Neighbors determined by user-defined distance cutoff [in Angstrom].

		https://en.wikipedia.org/wiki/Cell_list
		Ref: http://www.cs.cornell.edu/ron/references/1989/Calculations%20of%20a%20List%20of%20Neighbors%20in%20Molecular%20Dynamics%20Si.pdf

		Parameters
		----------
		neighbor_cutoff: float
		Threshold distance [Angstroms] for counting neighbors.
		periodic_box_size: 3 element array
		Dimensions of the periodic box in Angstroms, or None to not use periodic boundary conditions
		"""

		def __init__(self,
		max_num_neighbors=None,
		neighbor_cutoff=4,
		periodic_box_size=None):
		if neighbor_cutoff <= 0:
		raise ValueError("neighbor_cutoff must be positive value.")
		if max_num_neighbors is not None:
		if not isinstance(max_num_neighbors, int) or max_num_neighbors <= 0:
		raise ValueError("max_num_neighbors must be positive integer.")
		self.max_num_neighbors = max_num_neighbors
		self.neighbor_cutoff = neighbor_cutoff
		self.periodic_box_size = periodic_box_size
		# Type of data created by this featurizer
		self.dtype = object
		self.coordinates_featurizer = AtomicCoordinates()

		def _featurize(self, mol):
		"""
		Compute neighbor list.

		Parameters
		----------
		mol: rdkit Mol
		To be featurized.
		"""
		N = mol.GetNumAtoms()
		# TODO(rbharath): Should this return a list?
		bohr_coords = self.coordinates_featurizer._featurize(mol)[0]
		coords = get_coords(mol)
		neighbor_list = compute_neighbor_list(coords, self.neighbor_cutoff,
		self.max_num_neighbors,
		self.periodic_box_size)
		return (bohr_coords, neighbor_list)


		class NeighborListComplexAtomicCoordinates(ComplexFeaturizer):
		"""
		Adjacency list of neighbors for protein-ligand complexes in 3-space.

		Neighbors dtermined by user-dfined distance cutoff.
		"""

		def __init__(self, max_num_neighbors=None, neighbor_cutoff=4):
		if neighbor_cutoff <= 0:
		raise ValueError("neighbor_cutoff must be positive value.")
		if max_num_neighbors is not None:
		if not isinstance(max_num_neighbors, int) or max_num_neighbors <= 0:
		raise ValueError("max_num_neighbors must be positive integer.")
		self.max_num_neighbors = max_num_neighbors
		self.neighbor_cutoff = neighbor_cutoff
		# Type of data created by this featurizer
		self.dtype = object
		self.coordinates_featurizer = AtomicCoordinates()

		def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
		"""
		Compute neighbor list for complex.

		Parameters
		----------
		mol_pdb_file: Str
		Filename for ligand pdb file.
		protein_pdb_file: Str
		Filename for protein pdb file.
		"""
		mol_coords, ob_mol = rdkit_util.load_molecule(mol_pdb_file)
		protein_coords, protein_mol = rdkit_util.load_molecule(protein_pdb_file)
		system_coords = rdkit_util.merge_molecules_xyz(mol_coords, protein_coords)

		system_neighbor_list = compute_neighbor_list(
		system_coords, self.neighbor_cutoff, self.max_num_neighbors, None)

		return (system_coords, system_neighbor_list)


		class ComplexNeighborListFragmentAtomicCoordinates(ComplexFeaturizer):
		"""This class computes the featurization that corresponds to AtomicConvModel.

		This class computes featurizations needed for AtomicConvModel. Given a
		two molecular structures, it computes a number of useful geometric
		features. In particular, for each molecule and the global complex, it
		computes a coordinates matrix of size (N_atoms, 3) where N_atoms is the
		number of atoms. It also computes a neighbor-list, a dictionary with
		N_atoms elements where neighbor-list[i] is a list of the atoms the i-th
		atom has as neighbors. In addition, it computes a z-matrix for the
		molecule which is an array of shape (N_atoms,) that contains the atomic
		number of that atom.

		Since the featurization computes these three quantities for each of the
		two molecules and the complex, a total of 9 quantities are returned for
		each complex. Note that for efficiency, fragments of the molecules can be
		provided rather than the full molecules themselves.
		"""

		def __init__(self,
		frag1_num_atoms,
		frag2_num_atoms,
		complex_num_atoms,
		max_num_neighbors,
		neighbor_cutoff,
		strip_hydrogens=True):
		self.frag1_num_atoms = frag1_num_atoms
		self.frag2_num_atoms = frag2_num_atoms
		self.complex_num_atoms = complex_num_atoms
		self.max_num_neighbors = max_num_neighbors
		self.neighbor_cutoff = neighbor_cutoff
		self.strip_hydrogens = strip_hydrogens
		self.neighborlist_featurizer = NeighborListComplexAtomicCoordinates(
		self.max_num_neighbors, self.neighbor_cutoff)

		def _featurize_complex(self, mol_pdb_file, protein_pdb_file):
		try:
		frag1_coords, frag1_mol = rdkit_util.load_molecule(mol_pdb_file)
		frag2_coords, frag2_mol = rdkit_util.load_molecule(protein_pdb_file)
		except MoleculeLoadException:
		# Currently handles loading failures by returning None
		# TODO: Is there a better handling procedure?
		logging.warning("Some molecules cannot be loaded by Rdkit. Skipping")
		return None
		system_mol = rdkit_util.merge_molecules(frag1_mol, frag2_mol)
		system_coords = rdkit_util.get_xyz_from_mol(system_mol)

		frag1_coords, frag1_mol = self._strip_hydrogens(frag1_coords, frag1_mol)
		frag2_coords, frag2_mol = self._strip_hydrogens(frag2_coords, frag2_mol)
		system_coords, system_mol = self._strip_hydrogens(system_coords, system_mol)

		try:
		frag1_coords, frag1_neighbor_list, frag1_z = self.featurize_mol(
		frag1_coords, frag1_mol, self.frag1_num_atoms)

		frag2_coords, frag2_neighbor_list, frag2_z = self.featurize_mol(
		frag2_coords, frag2_mol, self.frag2_num_atoms)

		system_coords, system_neighbor_list, system_z = self.featurize_mol(
		system_coords, system_mol, self.complex_num_atoms)
		except ValueError as e:
		logging.warning(
		"max_atoms was set too low. Some complexes too large and skipped")
		return None

		return frag1_coords, frag1_neighbor_list, frag1_z, frag2_coords, frag2_neighbor_list, frag2_z, \
		system_coords, system_neighbor_list, system_z

		def get_Z_matrix(self, mol, max_atoms):
		if len(mol.GetAtoms()) > max_atoms:
		raise ValueError("A molecule is larger than permitted by max_atoms. "
		"Increase max_atoms and try again.")
		return pad_array(
		np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()]), max_atoms)

		def featurize_mol(self, coords, mol, max_num_atoms):
		logging.info("Featurizing molecule of size: %d", len(mol.GetAtoms()))
		neighbor_list = compute_neighbor_list(coords, self.neighbor_cutoff,
		self.max_num_neighbors, None)
		z = self.get_Z_matrix(mol, max_num_atoms)
		z = pad_array(z, max_num_atoms)
		coords = pad_array(coords, (max_num_atoms, 3))
		return coords, neighbor_list, z

		def _strip_hydrogens(self, coords, mol):

		class MoleculeShim(object):
		"""
		Shim of a Molecule which supports #GetAtoms()
		"""

		def __init__(self, atoms):
		self.atoms = [AtomShim(x) for x in atoms]

		def GetAtoms(self):
		return self.atoms

		class AtomShim(object):

		def __init__(self, atomic_num):
		self.atomic_num = atomic_num

		def GetAtomicNum(self):
		return self.atomic_num

		if not self.strip_hydrogens:
		return coords, mol
		indexes_to_keep = []
		atomic_numbers = []
		for index, atom in enumerate(mol.GetAtoms()):
		if atom.GetAtomicNum() != 1:
		indexes_to_keep.append(index)
		atomic_numbers.append(atom.GetAtomicNum())
		mol = MoleculeShim(atomic_numbers)
		coords = coords[indexes_to_keep]
		return coords, mol

deepchem/feat/binding_pocket_features.py

deleted100644 → 0

+0 −53

Original line number	Diff line number	Diff line
		"""
		Featurizes proposed binding pockets.
		"""
		__author__ = "Bharath Ramsundar"
		__copyright__ = "Copyright 2017, Stanford University"
		__license__ = "MIT"

		import numpy as np
		from deepchem.utils.save import log
		from deepchem.feat import Featurizer


		class BindingPocketFeaturizer(Featurizer):
		"""
		Featurizes binding pockets with information about chemical environments.
		"""

		residues = [
		"ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
		"LEU", "LYS", "MET", "PHE", "PRO", "PYL", "SER", "SEC", "THR", "TRP",
		"TYR", "VAL", "ASX", "GLX"
		]

		n_features = len(residues)

		def featurize(self,
		protein_file,
		pockets,
		pocket_atoms_map,
		pocket_coords,
		verbose=False):
		"""
		Calculate atomic coodinates.
		"""
		import mdtraj
		protein = mdtraj.load(protein_file)
		n_pockets = len(pockets)
		n_residues = len(BindingPocketFeaturizer.residues)
		res_map = dict(zip(BindingPocketFeaturizer.residues, range(n_residues)))
		all_features = np.zeros((n_pockets, n_residues))
		for pocket_num, (pocket, coords) in enumerate(zip(pockets, pocket_coords)):
		pocket_atoms = pocket_atoms_map[pocket]
		for ind, atom in enumerate(pocket_atoms):
		atom_name = str(protein.top.atom(atom))
		# atom_name is of format RESX-ATOMTYPE
		# where X is a 1 to 4 digit number
		residue = atom_name[:3]
		if residue not in res_map:
		log("Warning: Non-standard residue in PDB file", verbose)
		continue
		atomtype = atom_name.split("-")[1]
		all_features[pocket_num, res_map[residue]] += 1
		return all_features

deepchem/feat/coulomb_matrices.py

deleted100644 → 0

+0 −271

File deleted.

Preview size limit exceeded, changes collapsed.

deepchem/feat/fingerprints.py

deleted100644 → 0

+0 −108

Original line number	Diff line number	Diff line
		"""
		Topological fingerprints.
		"""
		__author__ = "Steven Kearnes"
		__copyright__ = "Copyright 2014, Stanford University"
		__license__ = "MIT"

		from deepchem.feat import Featurizer


		class CircularFingerprint(Featurizer):
		"""
		Circular (Morgan) fingerprints.

		Parameters
		----------
		radius : int, optional (default 2)
		Fingerprint radius.
		size : int, optional (default 2048)
		Length of generated bit vector.
		chiral : bool, optional (default False)
		Whether to consider chirality in fingerprint generation.
		bonds : bool, optional (default True)
		Whether to consider bond order in fingerprint generation.
		features : bool, optional (default False)
		Whether to use feature information instead of atom information; see
		RDKit docs for more info.
		sparse : bool, optional (default False)
		Whether to return a dict for each molecule containing the sparse
		fingerprint.
		smiles : bool, optional (default False)
		Whether to calculate SMILES strings for fragment IDs (only applicable
		when calculating sparse fingerprints).
		"""
		name = 'circular'

		def __init__(self,
		radius=2,
		size=2048,
		chiral=False,
		bonds=True,
		features=False,
		sparse=False,
		smiles=False):
		self.radius = radius
		self.size = size
		self.chiral = chiral
		self.bonds = bonds
		self.features = features
		self.sparse = sparse
		self.smiles = smiles

		def _featurize(self, mol):
		"""
		Calculate circular fingerprint.

		Parameters
		----------
		mol : RDKit Mol
		Molecule.
		"""
		from rdkit import Chem
		from rdkit.Chem import rdMolDescriptors
		if self.sparse:
		info = {}
		fp = rdMolDescriptors.GetMorganFingerprint(
		mol,
		self.radius,
		useChirality=self.chiral,
		useBondTypes=self.bonds,
		useFeatures=self.features,
		bitInfo=info)
		fp = fp.GetNonzeroElements() # convert to a dict

		# generate SMILES for fragments
		if self.smiles:
		fp_smiles = {}
		for fragment_id, count in fp.items():
		root, radius = info[fragment_id][0]
		env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, root)
		frag = Chem.PathToSubmol(mol, env)
		smiles = Chem.MolToSmiles(frag)
		fp_smiles[fragment_id] = {'smiles': smiles, 'count': count}
		fp = fp_smiles
		else:
		fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
		mol,
		self.radius,
		nBits=self.size,
		useChirality=self.chiral,
		useBondTypes=self.bonds,
		useFeatures=self.features)
		return fp

		def __hash__(self):
		return hash((self.radius, self.size, self.chiral, self.bonds, self.features,
		self.sparse, self.smiles))

		def __eq__(self, other):
		if not isinstance(self, other.__class__):
		return False
		return self.radius == other.radius and \
		self.size == other.size and \
		self.chiral == other.chiral and \
		self.bonds == other.bonds and \
		self.features == other.features and \
		self.sparse == other.sparse and \
		self.smiles == other.smiles

Admin message