Merge pull request #1992 from deepchem/featurizers (344b1ff7) · Commits · 钟慕尧 / deepchem

deepchem/feat/init.py

+1 −4

Original line number	Diff line number	Diff line
		"""
		Making it easy to import in classes.
		"""
		__author__ = "Bharath Ramsundar"
		__copyright__ = "Copyright 2016, Stanford University"
		__license__ = "MIT"

		from deepchem.feat.base_classes import Featurizer
		from deepchem.feat.base_classes import MolecularFeaturizer
		from deepchem.feat.base_classes import ComplexFeaturizer
		from deepchem.feat.base_classes import UserDefinedFeaturizer
		from deepchem.feat.graph_features import ConvMolFeaturizer

deepchem/feat/atomic_coordinates.py

+0 −4

Original line number	Diff line number	Diff line
		"""
		Atomic coordinate featurizer.
		"""
		__author__ = "Joseph Gomes and Bharath Ramsundar"
		__copyright__ = "Copyright 2016, Stanford University"
		__license__ = "MIT"

		import logging
		import numpy as np
		from deepchem.utils.save import log

deepchem/feat/base_classes.py

+104 −21

Original line number	Diff line number	Diff line
		@@ -6,9 +6,7 @@ import types
		import numpy as np
		import multiprocessing

		__author__ = "Steven Kearnes"
		__copyright__ = "Copyright 2014, Stanford University"
		__license__ = "BSD 3-clause"
		logger = logging.getLogger(__name__)


		def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
		@@ -16,6 +14,58 @@ def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
		return featurizer._featurize_complex(mol_pdb_file, protein_pdb_file)


		class Featurizer(object):
		    """Abstract class for calculating a set of features for a datapoint.

		    This class is abstract and is not meant to be used directly. You'll
		    likely only interact with this class if you're a developer. In
		    that case, you might want to make a child class which
		    implements the `_featurize` method for calculating features for
		    a single datapoint if you'd like to make a featurizer for a
		    new datatype.
		    """

		    def featurize(self, datapoints, log_every_n=1000):
		        """Calculate features for datapoints.

		        Parameters
		        ----------
		        datapoints: iterable
		            A sequence of objects that you'd like to featurize. Subclasses of
		            `Featurizer` should implement the `_featurize` method that
		            featurizes a single object in the sequence.
		        log_every_n: int, optional (default 1000)
		            A progress message is logged for every `log_every_n`-th datapoint.

		        Returns
		        -------
		        A numpy array containing a featurized representation of `datapoints`,
		        with one entry per datapoint. Datapoints whose featurization raised
		        an exception are represented by an empty array; if that makes the
		        entries differ in shape, a 1D object array is returned instead of a
		        regular multi-dimensional array.
		        """
		        datapoints = list(datapoints)
		        features = []
		        for i, point in enumerate(datapoints):
		            if i % log_every_n == 0:
		                logger.info("Featurizing datapoint %i", i)
		            try:
		                features.append(self._featurize(point))
		            except Exception:
		                # Best effort: a single bad datapoint must not abort the whole
		                # run. `Exception` (rather than a bare `except`) lets
		                # KeyboardInterrupt and SystemExit propagate.
		                logger.warning(
		                    "Failed to featurize datapoint %d. Appending empty array", i)
		                features.append(np.array([]))

		        try:
		            return np.asarray(features)
		        except ValueError:
		            # Recent numpy versions refuse to build an array from entries of
		            # differing shapes (e.g. an empty placeholder next to real
		            # features). Fall back to an explicit 1D object array.
		            ragged = np.empty(len(features), dtype=object)
		            for index, feature in enumerate(features):
		                ragged[index] = feature
		            return ragged

		    def _featurize(self, datapoint):
		        """Calculate features for a single datapoint.

		        Child classes must override this method.

		        Parameters
		        ----------
		        datapoint: object
		            A single object to featurize.
		        """
		        raise NotImplementedError('Featurizer is not defined.')

		    def __call__(self, datapoints):
		        """Calculate features for datapoints.

		        Equivalent to calling `featurize` with the default `log_every_n`.

		        Parameters
		        ----------
		        datapoints: iterable
		            A sequence of objects that you'd like to featurize.
		        """
		        return self.featurize(datapoints)


		class ComplexFeaturizer(object):
		""""
		Abstract class for calculating features for mol/protein complexes.
		@@ -73,29 +123,62 @@ class ComplexFeaturizer(object):
		raise NotImplementedError('Featurizer is not defined.')


		class Featurizer(object):
		"""
		Abstract class for calculating a set of features for a molecule.
		class MolecularFeaturizer(Featurizer):
		"""Abstract class for calculating a set of features for a
		molecule.

		Child classes implement the _featurize method for calculating features
		for a single molecule.
		"""
		The defining feature of a `MolecularFeaturizer` is that it
		uses SMILES strings and RDKit molecule objects to represent
		small molecules. All other featurizers which are subclasses of
		this class should plan to process input which comes as SMILES
		strings or RDKit molecules.

		def featurize(self, mols, verbose=True, log_every_n=1000):
		Child classes need to implement the _featurize method for
		calculating features for a single molecule.

		Note
		----
		In general, subclasses of this class will require RDKit to be installed.
		"""
		Calculate features for molecules.

		def featurize(self, molecules, log_every_n=1000):
		"""Calculate features for molecules.

		Parameters
		----------
		mols : iterable
		RDKit Mol objects.
		molecules: RDKit Mol / SMILES string / iterable
		RDKit Mol, or SMILES string, or iterable sequence of RDKit mols/SMILES
		strings.

		Returns
		-------
		A numpy array containing a featurized representation of
		`molecules`.
		"""
		mols = list(mols)
		try:
		from rdkit import Chem
		from rdkit.Chem.rdchem import Mol
		except ModuleNotFoundError:
		raise ValueError("This class requires RDKit to be installed.")
		# Special case handling of single molecule
		if isinstance(molecules, str) or isinstance(molecules, Mol):
		molecules = [molecules]
		else:
		# Convert iterables to list
		molecutes = list(molecules)
		features = []
		for i, mol in enumerate(mols):
		if mol is not None:
		for i, mol in enumerate(molecules):
		if i % log_every_n == 0:
		logger.info("Featurizing datapoint %i" % i)
		try:
		# Process only case of SMILES strings.
		if isinstance(mol, str):
		# mol must be a SMILES string so parse
		mol = Chem.MolFromSmiles(mol)
		features.append(self._featurize(mol))
		else:
		except:
		logger.warning(
		"Failed to featurize datapoint %d. Appending empty array")
		features.append(np.array([]))

		features = np.asarray(features)
		@@ -112,16 +195,16 @@ class Featurizer(object):
		"""
		raise NotImplementedError('Featurizer is not defined.')

		def __call__(self, mols):
		def __call__(self, molecules):
		"""
		Calculate features for molecules.

		Parameters
		----------
		mols : iterable
		RDKit Mol objects.
		molecules: iterable
		An iterable yielding RDKit Mol objects or SMILES strings.
		"""
		return self.featurize(mols)
		return self.featurize(molecules)


		class UserDefinedFeaturizer(Featurizer):

deepchem/feat/basic.py

+40 −16

Original line number	Diff line number	Diff line
		"""
		Basic molecular features.
		"""
		__author__ = "Steven Kearnes"
		__copyright__ = "Copyright 2014, Stanford University"
		__license__ = "MIT"

		from deepchem.feat import Featurizer
		import numpy as np
		from deepchem.feat.base_classes import MolecularFeaturizer


		class MolecularWeight(Featurizer):
		"""
		Molecular weight.
		class MolecularWeight(MolecularFeaturizer):
		"""Molecular weight.

		Note
		----
		This class requires RDKit to be installed.
		"""
		name = ['mw', 'molecular_weight']

		def _featurize(self, mol):
		"""
		@@ -22,21 +22,37 @@ class MolecularWeight(Featurizer):
		----------
		mol : RDKit Mol
		Molecule.

		Returns
		-------
		np.ndarray of length 1 containing the molecular weight.
		"""
		try:
		from rdkit.Chem import Descriptors
		except ModuleNotFoundError:
		raise ValueError("This class requires RDKit to be installed.")
		wt = Descriptors.ExactMolWt(mol)
		wt = [wt]
		return wt
		return np.asarray(wt)


		class RDKitDescriptors(Featurizer):
		"""
		RDKit descriptors.
		class RDKitDescriptors(MolecularFeaturizer):
		"""RDKit descriptors.

		This class computes a list of chemical descriptors using RDKit.

		See http://rdkit.org/docs/GettingStartedInPython.html
		#list-of-available-descriptors.

		Attributes
		----------
		descriptors: np.ndarray
		1D array of RDKit descriptor names used in this class.

		Note
		----
		This class requires RDKit to be installed.
		"""
		name = 'descriptors'

		# (ytz): This is done to avoid future compatibility issues like inclusion of
		# the 3D descriptors or changing the feature size.
		@@ -69,9 +85,12 @@ class RDKitDescriptors(Featurizer):
		])

		def __init__(self):
		try:
		from rdkit.Chem import Descriptors
		except ModuleNotFoundError:
		raise ValueError("This class requires RDKit to be installed.")
		self.descriptors = []
		self.descList = []
		from rdkit.Chem import Descriptors
		for descriptor, function in Descriptors.descList:
		if descriptor in self.allowedDescriptors:
		self.descriptors.append(descriptor)
		@@ -85,8 +104,13 @@ class RDKitDescriptors(Featurizer):
		----------
		mol : RDKit Mol
		Molecule.

		Returns
		-------
		rval: np.ndarray
		1D array of RDKit descriptors for `mol`
		"""
		rval = []
		for desc_name, function in self.descList:
		rval.append(function(mol))
		return rval
		return np.asarray(rval)

deepchem/feat/coulomb_matrices.py

+95 −21

Original line number	Diff line number	Diff line
		@@ -3,24 +3,38 @@ Generate coulomb matrices for molecules.

		See Montavon et al., _New Journal of Physics_ __15__ (2013) 095003.
		"""
		__author__ = "Steven Kearnes"
		__copyright__ = "Copyright 2014, Stanford University"
		__license__ = "MIT"

		import numpy as np
		import deepchem as dc
		from deepchem.feat import Featurizer
		from deepchem.feat.base_classes import MolecularFeaturizer
		from deepchem.utils import pad_array
		from deepchem.feat.atomic_coordinates import AtomicCoordinates


		class BPSymmetryFunctionInput(Featurizer):
		"""
		Calculate Symmetry Function for each atom in the molecules
		Methods described in https://journals.aps.org/prl/pdf/10.1103/PhysRevLett.98.146401
		class BPSymmetryFunctionInput(MolecularFeaturizer):
		"""Calculate Symmetry Function for each atom in the molecules

		This method is described in [1]_

		References
		----------
		.. [1] Behler, Jörg, and Michele Parrinello. "Generalized neural-network
		representation of high-dimensional potential-energy surfaces." Physical
		review letters 98.14 (2007): 146401.

		Note
		----
		This class requires RDKit to be installed.
		"""

		def __init__(self, max_atoms):
		"""Initialize this featurizer.

		Parameters
		----------
		max_atoms: int
		The maximum number of atoms expected for molecules this featurizer will
		process.
		"""
		self.max_atoms = max_atoms

		def _featurize(self, mol):
		@@ -34,9 +48,11 @@ class BPSymmetryFunctionInput(Featurizer):
		return np.pad(features, ((0, self.max_atoms - n_atoms), (0, 0)), 'constant')


		class CoulombMatrix(Featurizer):
		"""
		Calculate Coulomb matrices for molecules.
		class CoulombMatrix(MolecularFeaturizer):
		"""Calculate Coulomb matrices for molecules.

		Coulomb matrices provide a representation of the electronic structure of a
		molecule. This method is described in [1]_.

		Parameters
		----------
		@@ -55,14 +71,24 @@ class CoulombMatrix(Featurizer):
		seed : int, optional
		Random seed.

		Example:

		Example
		-------
		>>> featurizers = dc.feat.CoulombMatrix(max_atoms=23)
		>>> input_file = 'deepchem/feat/tests/data/water.sdf' # really backed by water.sdf.csv
		>>> tasks = ["atomization_energy"]
		>>> loader = dc.data.SDFLoader(tasks, featurizer=featurizers)
		>>> dataset = loader.create_dataset(input_file) #doctest: +ELLIPSIS
		Reading structures from deepchem/feat/tests/data/water.sdf.

		References
		----------
		.. [1] Montavon, Grégoire, et al. "Learning invariant representations of
		molecules for atomization energy prediction." Advances in neural information
		processing systems. 2012.

		Note
		----
		This class requires RDKit to be installed.
		"""
		conformers = True
		name = 'coulomb_matrix'
		@@ -74,6 +100,28 @@ class CoulombMatrix(Featurizer):
		upper_tri=False,
		n_samples=1,
		seed=None):
		"""Initialize this featurizer.

		Parameters
		----------
		max_atoms: int
		The maximum number of atoms expected for molecules this featurizer will
		process.
		remove_hydrogens: bool, optional (default False)
		If True, remove hydrogens before processing them.
		randomize: bool, optional (default False)
		If True, use method `randomize_coulomb_matrix` to randomize Coulomb matrices.
		upper_tri: bool, optional (default False)
		Generate only upper triangle part of Coulomb matrices.
		n_samples: int, optional (default 1)
		If `randomize` is set to True, the number of random samples to draw.
		seed: int, optional (default None)
		Random seed to use.
		"""
		try:
		from rdkit import Chem
		except ModuleNotFoundError:
		raise ValueError("This class requires RDKit to be installed.")
		self.max_atoms = int(max_atoms)
		self.remove_hydrogens = remove_hydrogens
		self.randomize = randomize
		@@ -141,9 +189,7 @@ class CoulombMatrix(Featurizer):
		return rval

		def randomize_coulomb_matrix(self, m):
		"""
		Randomize a Coulomb matrix as decribed in Montavon et al.,
		New Journal of Physics, 15, (2013), 095003:
		"""Randomize a Coulomb matrix as described in [1]_:

		1. Compute row norms for M in a vector row_norms.
		2. Sample a zero-mean unit-variance noise vector e with dimension
		@@ -159,6 +205,10 @@ class CoulombMatrix(Featurizer):
		Number of random matrices to generate.
		seed : int, optional
		Random seed.

		References
		----------
		.. [1] Montavon et al., New Journal of Physics, 15, (2013), 095003
		"""
		rval = []
		row_norms = np.asarray([np.linalg.norm(row) for row in m], dtype=float)
		@@ -196,8 +246,10 @@ class CoulombMatrix(Featurizer):


		class CoulombMatrixEig(CoulombMatrix):
		"""
		Calculate the eigenvales of Coulomb matrices for molecules.
		"""Calculate the eigenvalues of Coulomb matrices for molecules.

		This featurizer computes the eigenvalues of the Coulomb matrices for provided
		molecules. Coulomb matrices are described in [1]_.

		Parameters
		----------
		@@ -214,14 +266,20 @@ class CoulombMatrixEig(CoulombMatrix):
		seed : int, optional
		Random seed.

		Example:

		Example
		-------
		>>> featurizers = dc.feat.CoulombMatrixEig(max_atoms=23)
		>>> input_file = 'deepchem/feat/tests/data/water.sdf' # really backed by water.sdf.csv
		>>> tasks = ["atomization_energy"]
		>>> loader = dc.data.SDFLoader(tasks, featurizer=featurizers)
		>>> dataset = loader.create_dataset(input_file) #doctest: +ELLIPSIS
		Reading structures from deepchem/feat/tests/data/water.sdf.

		References
		----------
		.. [1] Montavon, Grégoire, et al. "Learning invariant representations of
		molecules for atomization energy prediction." Advances in neural information
		processing systems. 2012.
		"""

		conformers = True
		@@ -233,6 +291,22 @@ class CoulombMatrixEig(CoulombMatrix):
		randomize=False,
		n_samples=1,
		seed=None):
		"""Initialize this featurizer.

		Parameters
		----------
		max_atoms: int
		The maximum number of atoms expected for molecules this featurizer will
		process.
		remove_hydrogens: bool, optional (default False)
		If True, remove hydrogens before processing them.
		randomize: bool, optional (default False)
		If True, use method `randomize_coulomb_matrix` to randomize Coulomb matrices.
		n_samples: int, optional (default 1)
		If `randomize` is set to True, the number of random samples to draw.
		seed: int, optional (default None)
		Random seed to use.
		"""
		self.max_atoms = int(max_atoms)
		self.remove_hydrogens = remove_hydrogens
		self.randomize = randomize

Admin message