surgery (944e980c) · Commits · 钟慕尧 / deepchem

deepchem/featurizers/init.py

0 → 100644

+256 −0

Original line number	Original line	Diff line number	Diff line
			"""
			Feature calculations.
			"""
			import types
			import numpy as np
			from rdkit import Chem
			from rdkit.Chem import rdGeometry, rdMolTransforms
			from deepchem.utils.ob_utils import Ionizer

			__author__ = "Steven Kearnes"
			__copyright__ = "Copyright 2014, Stanford University"
			__license__ = "BSD 3-clause"


			def get_featurizers():
			"""Compile a dict mapping strings to featurizer classes."""

			# import all Featurizer subclasses so __subclasses__ will work
			# these have to be local imports to avoid circular imports
			from .basic import MolecularWeight, SimpleDescriptors
			from .coulomb_matrices import CoulombMatrix
			from .dragon import DragonDescriptors
			from .esp import ESP
			from .fingerprints import CircularFingerprint
			from .images import MolImage
			from .scaffolds import ScaffoldGenerator
			from .shape_grid import ShapeGrid

			featurizers = {}
			for klass in Featurizer.__subclasses__():
			assert klass.name is not None, (klass.__name__ +
			" 'name' attribute is None.")
			if isinstance(klass.name, list):
			for name in klass.name:
			assert name not in featurizers
			featurizers[name] = klass
			else:
			assert klass.name not in featurizers
			featurizers[klass.name] = klass
			return featurizers


			def resolve_featurizer(name):
			"""
			Resolve featurizer class from a string.

			Parameters
			----------
			name : str
			Featurizer name.
			"""
			return get_featurizers()[name]

			class ComplexFeaturizer(object):
			""""
			Abstract class for calculating features for mol/protein complexes.

			Class Attributes
			----------------
			name : str or list
			Name (or names) of this featurizer (used for scripting).
			"""
			name = None

			def featurize_complexes(self, mol_pdbs, protein_pdbs, log_every_n=1000):
			"""
			Calculate features for mol/protein complexes.

			Parameters
			----------
			mol_pdbs: list
			List of PDBs for molecules. Each PDB should be a list of lines of the
			PDB file.
			protein_pdbs: list
			List of PDBs for proteins. Each PDB should be a list of lines of the
			PDB file.
			"""

			features = []
			for i, (mol_pdb, protein_pdb) in enumerate(zip(mol_pdbs, protein_pdbs)):
			if i % log_every_n == 0:
			print("Featurizing %d / %d" % (i, len(mol_pdbs)))
			features.append(self._featurize_complex(mol_pdb, protein_pdb))
			features = np.asarray(features)
			return features

			def _featurize_complex(self, mol_pdb, complex_pdb):
			"""
			Calculate features for single mol/protein complex.

			Parameters
			----------
			mol_pdb: list
			Should be a list of lines of the PDB file.
			complex_pdb: list
			Should be a list of lines of the PDB file.
			"""
			raise NotImplementedError('Featurizer is not defined.')

			class Featurizer(object):
			"""
			Abstract class for calculating a set of features for a molecule.

			Child classes implement the _featurize method for calculating features
			for a single molecule. The feature matrix returned by featurize has a
			shape that is inferred from the shape of the features returned by
			_featurize, and is either indexed by molecules or by molecules and
			conformers depending on the value of the conformers class attribute.

			Class Attributes
			----------------
			conformers : bool, optional (default False)
			Whether features are calculated for conformers. If True, the first
			two axes of the feature matrix will index molecules and conformers,
			respectively. If False, only molecule-level features are calculated
			and the feature matrix will not have a separate conformer dimension.
			This is a class attribute because some featurizers take 3D
			conformation into account and others do not, and this is not
			typically an instance-level decision.
			name : str or list
			Name (or names) of this featurizer (used for scripting).
			topo_view : bool (default False)
			Whether the calculated features represent a topological view of the
			data.
			"""
			conformers = False
			name = None
			topo_view = False

			def featurize(self, mols, parallel=False, client_kwargs=None,
			view_flags=None):
			"""
			Calculate features for molecules.

			Parameters
			----------
			mols : iterable
			RDKit Mol objects.
			parallel : bool, optional
			Whether to train subtrainers in parallel using
			IPython.parallel (default False).
			client_kwargs : dict, optional
			Keyword arguments for IPython.parallel Client.
			view_flags : dict, optional
			Flags for IPython.parallel LoadBalancedView.
			"""
			if self.conformers and isinstance(mols, types.GeneratorType):
			mols = list(mols)

			if parallel:
			from IPython.parallel import Client

			if client_kwargs is None:
			client_kwargs = {}
			if view_flags is None:
			view_flags = {}
			client = Client(**client_kwargs)
			client.direct_view().use_dill() # use dill
			view = client.load_balanced_view()
			view.set_flags(**view_flags)
			call = view.map(self._featurize, mols, block=False)
			features = call.get()

			# get output from engines
			call.display_outputs()

			else:
			features = [self._featurize(mol) for mol in mols]

			if self.conformers:
			features = self.conformer_container(mols, features)
			else:
			features = np.asarray(features)
			return features

			def _featurize(self, mol):
			"""
			Calculate features for a single molecule.

			Parameters
			----------
			mol : RDKit Mol
			Molecule.
			"""
			raise NotImplementedError('Featurizer is not defined.')

			def __call__(self, mols, parallel=False, client_kwargs=None,
			view_flags=None):
			"""
			Calculate features for molecules.

			Parameters
			----------
			mols : iterable
			RDKit Mol objects.
			parallel : bool, optional
			Whether to train subtrainers in parallel using
			IPython.parallel (default False).
			client_kwargs : dict, optional
			Keyword arguments for IPython.parallel Client.
			view_flags : dict, optional
			Flags for IPython.parallel LoadBalancedView.
			"""
			return self.featurize(mols, parallel, client_kwargs, view_flags)

			def conformer_container(self, mols, features):
			"""
			Put features into a container with an extra dimension for
			conformers. Conformer indices that are not used are masked.

			For example, if mols contains 3 molecules with 1, 2, 5 conformers,
			respectively, then the final container will have 3 entries on its
			first axis and 5 entries on its second axis. The remaining axes
			correspond to feature dimensions.

			Parameters
			----------
			mols : iterable
			RDKit Mol objects.
			features : list
			Features calculated for molecule conformers. Each element
			corresponds to the features for a molecule and should be an
			ndarray with conformers on the first axis.
			"""

			# get the maximum number of conformers
			max_confs = max([mol.GetNumConformers() for mol in mols])
			if not max_confs:
			max_confs = 1

			# construct the new container
			# - first axis = # mols
			# - second axis = max # conformers
			# - remaining axes = determined by feature shape
			features_shape = None
			for i in xrange(len(features)):
			for j in xrange(len(features[i])):
			if features[i][j] is not None:
			features_shape = features[i][0].shape
			break
			if features_shape is not None:
			break
			if features_shape is None:
			raise ValueError('Cannot find any features.')
			shape = (len(mols), max_confs) + features_shape
			x = np.ma.masked_all(shape)

			# fill in the container
			for i, (mol, mol_features) in enumerate(zip(mols, features)):
			n_confs = max(mol.GetNumConformers(), 1)
			try:
			x[i, :n_confs] = mol_features
			except ValueError: # handle None conformer values
			for j in xrange(n_confs):
			x[i, j] = mol_features[j]
			return x

deepchem/featurizers/basic.py

0 → 100644

+62 −0

Original line number	Original line	Diff line number	Diff line
			"""
			Basic molecular features.
			"""

			__author__ = "Steven Kearnes"
			__copyright__ = "Copyright 2014, Stanford University"
			__license__ = "BSD 3-clause"

			from rdkit.Chem import Descriptors

			from deepchem.featurizers import Featurizer


			class MolecularWeight(Featurizer):
			"""
			Molecular weight.
			"""
			name = ['mw', 'molecular_weight']

			def _featurize(self, mol):
			"""
			Calculate molecular weight.

			Parameters
			----------
			mol : RDKit Mol
			Molecule.
			"""
			wt = Descriptors.ExactMolWt(mol)
			wt = [wt]
			return wt


			class SimpleDescriptors(Featurizer):
			"""
			RDKit descriptors.

			See http://rdkit.org/docs/GettingStartedInPython.html
			#list-of-available-descriptors.
			"""
			name = 'descriptors'

			def __init__(self):
			self.descriptors = []
			self.functions = []
			for descriptor, function in Descriptors.descList:
			self.descriptors.append(descriptor)
			self.functions.append(function)

			def _featurize(self, mol):
			"""
			Calculate RDKit descriptors.

			Parameters
			----------
			mol : RDKit Mol
			Molecule.
			"""
			rval = []
			for function in self.functions:
			rval.append(function(mol))
			return rval

deepchem/featurizers/coulomb_matrices.py

0 → 100644

+156 −0

Original line number	Original line	Diff line number	Diff line
			"""
			Generate coulomb matrices for molecules.

			See Montavon et al., _New Journal of Physics_ __15__ (2013) 095003.
			"""

			__author__ = "Steven Kearnes"
			__copyright__ = "Copyright 2014, Stanford University"
			__license__ = "BSD 3-clause"

			import numpy as np

			from rdkit import Chem

			from deepchem.featurizers import Featurizer
			from vs_utils.utils import pad_array


			class CoulombMatrix(Featurizer):
			"""
			Calculate Coulomb matrices for molecules.

			Parameters
			----------
			max_atoms : int
			Maximum number of atoms for any molecule in the dataset. Used to
			pad the Coulomb matrix.
			remove_hydrogens : bool, optional (default True)
			Whether to remove hydrogens before constructing Coulomb matrix.
			randomize : bool, optional (default True)
			Whether to randomize Coulomb matrices to remove dependence on atom
			index order.
			n_samples : int, optional (default 1)
			Number of random Coulomb matrices to generate if randomize is True.
			seed : int, optional
			Random seed.
			"""
			conformers = True
			name = 'coulomb_matrix'

			def __init__(self, max_atoms, remove_hydrogens=True, randomize=True,
			n_samples=1, seed=None):
			self.max_atoms = int(max_atoms)
			self.remove_hydrogens = remove_hydrogens
			self.randomize = randomize
			self.n_samples = n_samples
			if seed is not None:
			seed = int(seed)
			self.seed = seed

			def _featurize(self, mol):
			"""
			Calculate Coulomb matrices for molecules. If extra randomized
			matrices are generated, they are treated as if they are features
			for additional conformers.

			Since Coulomb matrices are symmetric, only the (flattened) upper
			triangular portion is returned.

			Parameters
			----------
			mol : RDKit Mol
			Molecule.
			"""
			features = self.coulomb_matrix(mol)
			features = [f[np.triu_indices_from(f)] for f in features]
			features = np.asarray(features)
			return features

			def coulomb_matrix(self, mol):
			"""
			Generate Coulomb matrices for each conformer of the given molecule.

			Parameters
			----------
			mol : RDKit Mol
			Molecule.
			"""
			if self.remove_hydrogens:
			mol = Chem.RemoveHs(mol)
			n_atoms = mol.GetNumAtoms()
			z = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
			rval = []
			for conf in mol.GetConformers():
			d = self.get_interatomic_distances(conf)
			m = np.zeros((n_atoms, n_atoms))
			for i in xrange(mol.GetNumAtoms()):
			for j in xrange(mol.GetNumAtoms()):
			if i == j:
			m[i, j] = 0.5 * z[i] ** 2.4
			elif i < j:
			m[i, j] = (z[i] * z[j]) / d[i, j]
			m[j, i] = m[i, j]
			else:
			continue
			if self.randomize:
			for random_m in self.randomize_coulomb_matrix(m):
			random_m = pad_array(random_m, self.max_atoms)
			rval.append(random_m)
			else:
			m = pad_array(m, self.max_atoms)
			rval.append(m)
			rval = np.asarray(rval)
			return rval

			def randomize_coulomb_matrix(self, m):
			"""
			Randomize a Coulomb matrix as decribed in Montavon et al., _New Journal
			of Physics_ __15__ (2013) 095003:

			1. Compute row norms for M in a vector row_norms.
			2. Sample a zero-mean unit-variance noise vector e with dimension
			equal to row_norms.
			3. Permute the rows and columns of M with the permutation that
			sorts row_norms + e.

			Parameters
			----------
			m : ndarray
			Coulomb matrix.
			n_samples : int, optional (default 1)
			Number of random matrices to generate.
			seed : int, optional
			Random seed.
			"""
			rval = []
			row_norms = np.asarray([np.linalg.norm(row) for row in m], dtype=float)
			rng = np.random.RandomState(self.seed)
			for i in xrange(self.n_samples):
			e = rng.normal(size=row_norms.size)
			p = np.argsort(row_norms + e)
			new = m[p][:, p] # permute rows first, then columns
			rval.append(new)
			return rval

			@staticmethod
			def get_interatomic_distances(conf):
			"""
			Get interatomic distances for atoms in a molecular conformer.

			Parameters
			----------
			conf : RDKit Conformer
			Molecule conformer.
			"""
			n_atoms = conf.GetNumAtoms()
			coords = [conf.GetAtomPosition(i) for i in xrange(n_atoms)]
			d = np.zeros((n_atoms, n_atoms), dtype=float)
			for i in xrange(n_atoms):
			for j in xrange(n_atoms):
			if i < j:
			d[i, j] = coords[i].Distance(coords[j])
			d[j, i] = d[i, j]
			else:
			continue
			return d

deepchem/utils/featurize.py→deepchem/featurizers/featurize.py

+3 −3

Original line number	Original line	Diff line number	Diff line
	@@ -10,13 +10,13 @@ import pandas as pd
	import numpy as np		import numpy as np
	import csv		import csv
	from rdkit import Chem		from rdkit import Chem
	from vs_utils.features.fingerprints import CircularFingerprint		from deepchem.featurizers.fingerprints import CircularFingerprint
	from vs_utils.features.basic import SimpleDescriptors		from deepchem.featurizers.basic import SimpleDescriptors
	from deepchem.utils.save import save_to_disk		from deepchem.utils.save import save_to_disk
	from deepchem.utils.save import load_from_disk		from deepchem.utils.save import load_from_disk
	from deepchem.utils.save import load_pandas_from_disk		from deepchem.utils.save import load_pandas_from_disk
	from vs_utils.utils import ScaffoldGenerator		from vs_utils.utils import ScaffoldGenerator
	from vs_utils.features.nnscore import NNScoreComplexFeaturizer		from deepchem.featurizers.nnscore import NNScoreComplexFeaturizer
	import multiprocessing as mp		import multiprocessing as mp
	from functools import partial		from functools import partial

deepchem/featurizers/fingerprints.py

0 → 100644

+82 −0

Original line number	Original line	Diff line number	Diff line
			"""
			Topological fingerprints.
			"""

			__author__ = "Steven Kearnes"
			__copyright__ = "Copyright 2014, Stanford University"
			__license__ = "BSD 3-clause"

			from rdkit import Chem
			from rdkit.Chem import rdMolDescriptors

			from deepchem.featurizers import Featurizer


			class CircularFingerprint(Featurizer):
			"""
			Circular (Morgan) fingerprints.

			Parameters
			----------
			radius : int, optional (default 2)
			Fingerprint radius.
			size : int, optional (default 2048)
			Length of generated bit vector.
			chiral : bool, optional (default False)
			Whether to consider chirality in fingerprint generation.
			bonds : bool, optional (default True)
			Whether to consider bond order in fingerprint generation.
			features : bool, optional (default False)
			Whether to use feature information instead of atom information; see
			RDKit docs for more info.
			sparse : bool, optional (default False)
			Whether to return a dict for each molecule containing the sparse
			fingerprint.
			smiles : bool, optional (default False)
			Whether to calculate SMILES strings for fragment IDs (only applicable
			when calculating sparse fingerprints).
			"""
			name = 'circular'

			def __init__(self, radius=2, size=2048, chiral=False, bonds=True,
			features=False, sparse=False, smiles=False):
			self.radius = radius
			self.size = size
			self.chiral = chiral
			self.bonds = bonds
			self.features = features
			self.sparse = sparse
			self.smiles = smiles

			def _featurize(self, mol):
			"""
			Calculate circular fingerprint.

			Parameters
			----------
			mol : RDKit Mol
			Molecule.
			"""
			if self.sparse:
			info = {}
			fp = rdMolDescriptors.GetMorganFingerprint(
			mol, self.radius, useChirality=self.chiral,
			useBondTypes=self.bonds, useFeatures=self.features,
			bitInfo=info)
			fp = fp.GetNonzeroElements() # convert to a dict

			# generate SMILES for fragments
			if self.smiles:
			fp_smiles = {}
			for fragment_id, count in fp.items():
			root, radius = info[fragment_id][0]
			env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, root)
			frag = Chem.PathToSubmol(mol, env)
			smiles = Chem.MolToSmiles(frag)
			fp_smiles[fragment_id] = {'smiles': smiles, 'count': count}
			fp = fp_smiles
			else:
			fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
			mol, self.radius, nBits=self.size, useChirality=self.chiral,
			useBondTypes=self.bonds, useFeatures=self.features)
			return fp

Admin message