changes (067cc2d8) · Commits · 钟慕尧 / deepchem

deepchem/feat/init.py

+1 −4

Original line number	Diff line number	Diff line
		"""
		Making it easy to import in classes.
		"""
		__author__ = "Bharath Ramsundar"
		__copyright__ = "Copyright 2016, Stanford University"
		__license__ = "MIT"

		from deepchem.feat.base_classes import Featurizer
		from deepchem.feat.base_classes import MolecularFeaturizer
		from deepchem.feat.base_classes import ComplexFeaturizer
		from deepchem.feat.base_classes import UserDefinedFeaturizer
		from deepchem.feat.graph_features import ConvMolFeaturizer

deepchem/feat/atomic_coordinates.py

+0 −4

Original line number	Diff line number	Diff line
		"""
		Atomic coordinate featurizer.
		"""
		__author__ = "Joseph Gomes and Bharath Ramsundar"
		__copyright__ = "Copyright 2016, Stanford University"
		__license__ = "MIT"

		import logging
		import numpy as np
		from deepchem.utils.save import log

deepchem/feat/base_classes.py

+86 −12

Original line number	Diff line number	Diff line
		@@ -6,9 +6,7 @@ import types
		import numpy as np
		import multiprocessing

		__author__ = "Steven Kearnes"
		__copyright__ = "Copyright 2014, Stanford University"
		__license__ = "BSD 3-clause"
		logger = logging.getLogger(__name__)


		def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
		@@ -16,6 +14,53 @@ def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):
		return featurizer._featurize_complex(mol_pdb_file, protein_pdb_file)


		class Featurizer(object):
		  """Abstract class for calculating a set of features for a datapoint.

		  This class is abstract and is not meant to be used directly. You'll
		  likely only interact with this class if you're a developer. In
		  that case, you might want to make a child class which implements
		  the `_featurize` method for calculating features for a single
		  datapoint if you'd like to make a featurizer for a new datatype.
		  """

		  def featurize(self, datapoints, log_every_n=1000):
		    """Calculate features for datapoints.

		    Parameters
		    ----------
		    datapoints: iterable
		      Datapoints to featurize. Every entry that is not `None` is
		      passed to `_featurize`; a `None` entry yields an empty array.
		    log_every_n: int, default 1000
		      Emit a progress message every `log_every_n` datapoints. A
		      falsy value (0 or `None`) disables progress logging.

		    Returns
		    -------
		    np.ndarray
		      One entry per datapoint. When the per-datapoint features do not
		      share a shape (for instance because of the empty placeholder
		      used for `None`), an object array is returned.

		    Raises
		    ------
		    NotImplementedError
		      If the subclass does not implement `_featurize`.
		    """
		    # Same logger object that `logging.getLogger(__name__)` yields at
		    # module level, looked up here so the method is self-contained.
		    progress_logger = logging.getLogger(__name__)
		    features = []
		    for i, point in enumerate(datapoints):
		      # Guard against log_every_n == 0 to avoid a modulo-by-zero.
		      if log_every_n and i % log_every_n == 0:
		        progress_logger.info("Featurizing datapoint %i", i)
		      if point is None:
		        # Keep positional alignment with the input.
		        features.append(np.array([]))
		      else:
		        features.append(self._featurize(point))

		    try:
		      return np.asarray(features)
		    except ValueError:
		      # Recent NumPy versions refuse to build ragged arrays implicitly;
		      # request the object array explicitly instead of failing.
		      return np.asarray(features, dtype=object)

		  def _featurize(self, datapoint):
		    """Calculate features for a single datapoint.

		    Parameters
		    ----------
		    datapoint: object
		      A single datapoint. Subclasses must override this method.

		    Raises
		    ------
		    NotImplementedError
		      Always, in this base class.
		    """
		    raise NotImplementedError('Featurizer is not defined.')

		  def __call__(self, datapoints):
		    """Calculate features for datapoints.

		    Equivalent to calling `featurize(datapoints)` with default
		    arguments.

		    Parameters
		    ----------
		    datapoints: iterable
		      Datapoints to featurize.
		    """
		    return self.featurize(datapoints)


		class ComplexFeaturizer(object):
		""""
		Abstract class for calculating features for mol/protein complexes.
		@@ -73,27 +118,56 @@ class ComplexFeaturizer(object):
		raise NotImplementedError('Featurizer is not defined.')


		class Featurizer(object):
		"""
		Abstract class for calculating a set of features for a molecule.
		class MolecularFeaturizer(object):
		"""Abstract class for calculating a set of features for a
		molecule.

		The defining feature of a `MolecularFeaturizer` is that it
		uses SMILES strings and RDKit molecule objects to represent
		small molecules. All other featurizers which are subclasses of
		this class should plan to process input which comes as SMILES
		strings or RDKit molecules.

		Child classes implement the _featurize method for calculating features
		for a single molecule.
		Child classes need to implement the _featurize method for
		calculating features for a single molecule.

		Note
		----
		In general, subclasses of this class will require RDKit to be installed.
		"""

		def featurize(self, mols, verbose=True, log_every_n=1000):
		"""
		Calculate features for molecules.
		"""Calculate features for molecules.

		Parameters
		----------
		mols : iterable
		RDKit Mol objects.
		RDKit Mol, or SMILES string, or filename for
		mol2/sdf/pdb/pdbqt file.

		Returns
		-------
		A numpy array containing a featurized representation of
		`mols`.
		"""
		try:
		from rdkit import Chem
		from rdkit.Chem.rdchem import Mol
		except ModuleNotFoundError:
		raise ValueError("This class requires RDKit to be installed.")
		# Special case handling of single molecule
		if isinstance(mols, str) or isinstance(mols, Mol):
		mols = [mols]
		else:
		# Convert iterables to list
		mols = list(mols)
		features = []
		for i, mol in enumerate(mols):
		if mol is not None:
		# Process only case of SMILES strings.
		if isinstance(mol, str):
		# mol must be a SMILES string so parse
		mol = Chem.MolFromSmiles(mol)
		features.append(self._featurize(mol))
		else:
		features.append(np.array([]))

deepchem/feat/basic.py

+28 −12

Original line number	Diff line number	Diff line
		"""
		Basic molecular features.
		"""
		__author__ = "Steven Kearnes"
		__copyright__ = "Copyright 2014, Stanford University"
		__license__ = "MIT"

		from deepchem.feat import Featurizer
		from deepchem.feat.base_classes import MolecularFeaturizer


		class MolecularWeight(Featurizer):
		"""
		Molecular weight.
		class MolecularWeight(MolecularFeaturizer):
		"""Molecular weight.

		Note
		----
		This class requires RDKit to be installed.
		"""
		name = ['mw', 'molecular_weight']

		@@ -23,18 +23,26 @@ class MolecularWeight(Featurizer):
		mol : RDKit Mol
		Molecule.
		"""
		try:
		from rdkit.Chem import Descriptors
		except ModuleNotFoundError:
		raise ValueError("This class requires RDKit to be installed.")
		wt = Descriptors.ExactMolWt(mol)
		wt = [wt]
		return wt


		class RDKitDescriptors(Featurizer):
		"""
		RDKit descriptors.
		class RDKitDescriptors(MolecularFeaturizer):
		"""RDKit descriptors.

		This class computes a list of chemical descriptors using RDKit.

		See http://rdkit.org/docs/GettingStartedInPython.html
		#list-of-available-descriptors.

		Note
		----
		This class requires RDKit to be installed.
		"""
		name = 'descriptors'

		@@ -69,9 +77,12 @@ class RDKitDescriptors(Featurizer):
		])

		def __init__(self):
		try:
		from rdkit.Chem import Descriptors
		except ModuleNotFoundError:
		raise ValueError("This class requires RDKit to be installed.")
		self.descriptors = []
		self.descList = []
		from rdkit.Chem import Descriptors
		for descriptor, function in Descriptors.descList:
		if descriptor in self.allowedDescriptors:
		self.descriptors.append(descriptor)
		@@ -85,6 +96,11 @@ class RDKitDescriptors(Featurizer):
		----------
		mol : RDKit Mol
		Molecule.

		Returns
		-------
		rval: np.ndarray
		Vector of RDKit descriptors for `mol`
		"""
		rval = []
		for desc_name, function in self.descList:

deepchem/feat/coulomb_matrices.py

+48 −18

Original line number	Diff line number	Diff line
		@@ -3,21 +3,27 @@ Generate coulomb matrices for molecules.

		See Montavon et al., _New Journal of Physics_ __15__ (2013) 095003.
		"""
		__author__ = "Steven Kearnes"
		__copyright__ = "Copyright 2014, Stanford University"
		__license__ = "MIT"

		import numpy as np
		import deepchem as dc
		from deepchem.feat import Featurizer
		from deepchem.feat.base_classes import MolecularFeaturizer
		from deepchem.utils import pad_array
		from deepchem.feat.atomic_coordinates import AtomicCoordinates


		class BPSymmetryFunctionInput(Featurizer):
		"""
		Calculate Symmetry Function for each atom in the molecules
		Methods described in https://journals.aps.org/prl/pdf/10.1103/PhysRevLett.98.146401
		class BPSymmetryFunctionInput(MolecularFeaturizer):
		"""Calculate Symmetry Function for each atom in the molecules

		This method is described in [1]_

		References
		----------
		.. [1] Behler, Jörg, and Michele Parrinello. "Generalized neural-network
		representation of high-dimensional potential-energy surfaces." Physical
		review letters 98.14 (2007): 146401.

		Note
		----
		This class requires RDKit to be installed.
		"""

		def __init__(self, max_atoms):
		@@ -34,9 +40,11 @@ class BPSymmetryFunctionInput(Featurizer):
		return np.pad(features, ((0, self.max_atoms - n_atoms), (0, 0)), 'constant')


		class CoulombMatrix(Featurizer):
		"""
		Calculate Coulomb matrices for molecules.
		class CoulombMatrix(MolecularFeaturizer):
		"""Calculate Coulomb matrices for molecules.

		Coulomb matrices provide a representation of the electronic structure of a
		molecule. This method is described in [1]_.

		Parameters
		----------
		@@ -55,14 +63,24 @@ class CoulombMatrix(Featurizer):
		seed : int, optional
		Random seed.

		Example:

		Example
		-------
		>>> featurizers = dc.feat.CoulombMatrix(max_atoms=23)
		>>> input_file = 'deepchem/feat/tests/data/water.sdf' # really backed by water.sdf.csv
		>>> tasks = ["atomization_energy"]
		>>> loader = dc.data.SDFLoader(tasks, featurizer=featurizers)
		>>> dataset = loader.create_dataset(input_file) #doctest: +ELLIPSIS
		Reading structures from deepchem/feat/tests/data/water.sdf.

		References
		----------
		.. [1] Montavon, Grégoire, et al. "Learning invariant representations of
		molecules for atomization energy prediction." Advances in neural information
		processing systems. 2012.

		Note
		----
		This class requires RDKit to be installed.
		"""
		conformers = True
		name = 'coulomb_matrix'
		@@ -74,6 +92,10 @@ class CoulombMatrix(Featurizer):
		upper_tri=False,
		n_samples=1,
		seed=None):
		try:
		from rdkit import Chem
		except ModuleNotFoundError:
		raise ValueError("This class requires RDKit to be installed.")
		self.max_atoms = int(max_atoms)
		self.remove_hydrogens = remove_hydrogens
		self.randomize = randomize
		@@ -196,8 +218,10 @@ class CoulombMatrix(Featurizer):


		class CoulombMatrixEig(CoulombMatrix):
		"""
		Calculate the eigenvales of Coulomb matrices for molecules.
		"""Calculate the eigenvalues of Coulomb matrices for molecules.

		This featurizer computes the eigenvalues of the Coulomb matrices for provided
		molecules. Coulomb matrices are described in [1]_.

		Parameters
		----------
		@@ -214,14 +238,20 @@ class CoulombMatrixEig(CoulombMatrix):
		seed : int, optional
		Random seed.

		Example:

		Example
		-------
		>>> featurizers = dc.feat.CoulombMatrixEig(max_atoms=23)
		>>> input_file = 'deepchem/feat/tests/data/water.sdf' # really backed by water.sdf.csv
		>>> tasks = ["atomization_energy"]
		>>> loader = dc.data.SDFLoader(tasks, featurizer=featurizers)
		>>> dataset = loader.create_dataset(input_file) #doctest: +ELLIPSIS
		Reading structures from deepchem/feat/tests/data/water.sdf.

		References
		----------
		.. [1] Montavon, Grégoire, et al. "Learning invariant representations of
		molecules for atomization energy prediction." Advances in neural information
		processing systems. 2012.
		"""

		conformers = True

Admin message