Changes (2beebfa2) · Commits · 钟慕尧 / deepchem

deepchem/feat/base_classes.py

+12 −6

Original line number	Diff line number	Diff line
		@@ -30,8 +30,10 @@ class Featurizer(object):

		Parameters
		----------
		datapoints: object
		Any blob of data you like. Subclasss should instantiate this.
		datapoints: iterable
		A sequence of objects that you'd like to featurize. Subclasses of
		`Featurizer` should implement the `_featurize` method that featurizes
		objects in the sequence.

		Returns
		-------
		@@ -40,6 +42,8 @@ class Featurizer(object):
		datapoints = list(datapoints)
		features = []
		for i, point in enumerate(datapoints):
		if i % log_every_n == 0:
		logger.info("Featurizing datapoint %i" % i)
		if point is not None:
		features.append(self._featurize(point))
		else:
		@@ -135,14 +139,14 @@ class MolecularFeaturizer(Featurizer):
		In general, subclasses of this class will require RDKit to be installed.
		"""

		def featurize(self, mols, verbose=True, log_every_n=1000):
		def featurize(self, mols, log_every_n=1000):
		"""Calculate features for molecules.

		Parameters
		----------
		mols : iterable
		RDKit Mol, or SMILES string, or filename for
		mol2/sdf/pdb/pdbqt file.
		mols : RDKit Mol / SMILES string /iterable
		RDKit Mol, or SMILES string or iterable sequence of RDKit mols/SMILES
		strings.

		Returns
		-------
		@@ -162,6 +166,8 @@ class MolecularFeaturizer(Featurizer):
		mols = list(mols)
		features = []
		for i, mol in enumerate(mols):
		if i % log_every_n == 0:
		logger.info("Featurizing datapoint %i" % i)
		if mol is not None:
		# Process only case of SMILES strings.
		if isinstance(mol, str):

deepchem/feat/basic.py

+3 −3

Original line number	Diff line number	Diff line
		@@ -46,8 +46,8 @@ class RDKitDescriptors(MolecularFeaturizer):

		Attributes
		----------
		descriptors: list
		List of RDKit descriptor names used in this class.
		descriptors: np.ndarray
		1D array of RDKit descriptor names used in this class.

		Note
		----
		@@ -108,7 +108,7 @@ class RDKitDescriptors(MolecularFeaturizer):
		Returns
		-------
		rval: np.ndarray
		Vector of RDKit descriptors for `mol`
		1D array of RDKit descriptors for `mol`
		"""
		rval = []
		for desc_name, function in self.descList:

deepchem/feat/coulomb_matrices.py

+47 −3

Original line number	Diff line number	Diff line
		@@ -27,6 +27,14 @@ class BPSymmetryFunctionInput(MolecularFeaturizer):
		"""

		def __init__(self, max_atoms):
		"""Initialize this featurizer.

		Parameters
		----------
		max_atoms: int
		The maximum number of atoms expected for molecules this featurizer will
		process.
		"""
		self.max_atoms = max_atoms

		def _featurize(self, mol):
		@@ -92,6 +100,24 @@ class CoulombMatrix(MolecularFeaturizer):
		upper_tri=False,
		n_samples=1,
		seed=None):
		"""Initialize this featurizer.

		Parameters
		----------
		max_atoms: int
		The maximum number of atoms expected for molecules this featurizer will
		process.
		remove_hydrogens: bool, optional (default False)
		If True, remove hydrogens before processing them.
		randomize: bool, optional (default False)
		If True, use method `randomize_coulomb_matrix` to randomize Coulomb matrices.
		upper_tri: bool, optional (default False)
		Generate only upper triangle part of Coulomb matrices.
		n_samples: int, optional (default 1)
		If `randomize` is set to True, the number of random samples to draw.
		seed: int, optional (default None)
		Random seed to use.
		"""
		try:
		from rdkit import Chem
		except ModuleNotFoundError:
		@@ -163,9 +189,7 @@ class CoulombMatrix(MolecularFeaturizer):
		return rval

		def randomize_coulomb_matrix(self, m):
		"""
		Randomize a Coulomb matrix as described in Montavon et al.,
		New Journal of Physics, 15, (2013), 095003:
		"""Randomize a Coulomb matrix as described in [1]_:

		1. Compute row norms for M in a vector row_norms.
		2. Sample a zero-mean unit-variance noise vector e with dimension
		@@ -181,6 +205,10 @@ class CoulombMatrix(MolecularFeaturizer):
		Number of random matrices to generate.
		seed : int, optional
		Random seed.

		References
		----------
		.. [1] Montavon et al., New Journal of Physics, 15, (2013), 095003
		"""
		rval = []
		row_norms = np.asarray([np.linalg.norm(row) for row in m], dtype=float)
		@@ -263,6 +291,22 @@ class CoulombMatrixEig(CoulombMatrix):
		randomize=False,
		n_samples=1,
		seed=None):
		"""Initialize this featurizer.

		Parameters
		----------
		max_atoms: int
		The maximum number of atoms expected for molecules this featurizer will
		process.
		remove_hydrogens: bool, optional (default False)
		If True, remove hydrogens before processing them.
		randomize: bool, optional (default False)
		If True, use method `randomize_coulomb_matrix` to randomize Coulomb matrices.
		n_samples: int, optional (default 1)
		If `randomize` is set to True, the number of random samples to draw.
		seed: int, optional (default None)
		Random seed to use.
		"""
		self.max_atoms = int(max_atoms)
		self.remove_hydrogens = remove_hydrogens
		self.randomize = randomize

deepchem/feat/graph_features.py

+211 −35

Original line number	Diff line number	Diff line
		import enum
		import numpy as np
		import deepchem as dc
		from deepchem.feat.base_classes import MolecularFeaturizer
		@@ -14,21 +15,73 @@ def _featurize_complex(featurizer, mol_pdb_file, protein_pdb_file, log_message):


		def one_of_k_encoding(x, allowable_set):
		"""Encodes elements of a provided set as integers.

		Parameters
		----------
		x: object
		Must be present in `allowable_set`.
		allowable_set: list
		List of allowable quantities.

		Example
		-------
		>>> import deepchem as dc
		>>> dc.feat.graph_features.one_of_k_encoding("a", ["a", "b", "c"])
		[True, False, False]

		Raises
		------
		`ValueError` if `x` is not in `allowable_set`.
		"""
		if x not in allowable_set:
		raise Exception("input {0} not in allowable set{1}:".format(
		raise ValueError("input {0} not in allowable set{1}:".format(
		x, allowable_set))
		return list(map(lambda s: x == s, allowable_set))


		def one_of_k_encoding_unk(x, allowable_set):
		"""Maps inputs not in the allowable set to the last element."""
		"""Maps inputs not in the allowable set to the last element.

		Unlike `one_of_k_encoding`, if `x` is not in `allowable_set`, this method
		pretends that `x` is the last element of `allowable_set`.

		Parameters
		----------
		x: object
		Value to encode. If it is not present in `allowable_set`, it is treated as the last element of `allowable_set`.
		allowable_set: list
		List of allowable quantities.

		Examples
		--------
		>>> dc.feat.graph_features.one_of_k_encoding_unk("s", ["a", "b", "c"])
		[False, False, True]
		"""
		if x not in allowable_set:
		x = allowable_set[-1]
		return list(map(lambda s: x == s, allowable_set))


		def get_intervals(l):
		"""For list of lists, gets the cumulative products of the lengths"""
		"""For list of lists, gets the cumulative products of the lengths

		Note that we add 1 to the lengths of all lists (to avoid an empty list
		propagating a 0).

		Parameters
		----------
		l: list of lists
		The lists whose lengths are used to compute the cumulative products.

		Examples
		--------
		>>> dc.feat.graph_features.get_intervals([[1], [1, 2], [1, 2, 3]])
		[1, 3, 12]

		>>> dc.feat.graph_features.get_intervals([[1], [], [1, 2], [1, 2, 3]])
		[1, 1, 3, 12]
		"""
		intervals = len(l) * [0]
		# Initialize with 1
		intervals[0] = 1
		@@ -39,36 +92,58 @@ def get_intervals(l):


		def safe_index(l, e):
		"""Gets the index of e in l, providing an index of len(l) if not found"""
		"""Gets the index of e in l, providing an index of len(l) if not found

		Parameters
		----------
		l: list
		List of values
		e: object
		Object to look up in `l`.

		Examples
		--------
		>>> dc.feat.graph_features.safe_index([1, 2, 3], 1)
		0
		>>> dc.feat.graph_features.safe_index([1, 2, 3], 7)
		3
		"""
		try:
		return l.index(e)
		except:
		return len(l)


		class GraphConvConstants(enum.Enum):
		"""Allowed Atom Types."""
		possible_atom_list = [
		'C', 'N', 'O', 'S', 'F', 'P', 'Cl', 'Mg', 'Na', 'Br', 'Fe', 'Ca', 'Cu',
		'Mc', 'Pd', 'Pb', 'K', 'I', 'Al', 'Ni', 'Mn'
		]
		"""Allowed Numbers of Hydrogens"""
		possible_numH_list = [0, 1, 2, 3, 4]
		"""Allowed Valences for Atoms"""
		possible_valence_list = [0, 1, 2, 3, 4, 5, 6]
		"""Allowed Formal Charges for Atoms"""
		possible_formal_charge_list = [-3, -2, -1, 0, 1, 2, 3]
		# To avoid importing rdkit, this is a placeholder list of the correct
		# length. These will be replaced with rdkit HybridizationType below
		"""This is a placeholder for documentation. These will be replaced with corresponding values of the rdkit HybridizationType"""
		possible_hybridization_list = ["SP", "SP2", "SP3", "SP3D", "SP3D2"]
		"""Allowed number of radical electrons."""
		possible_number_radical_e_list = [0, 1, 2]
		"""Allowed types of Chirality"""
		possible_chirality_list = ['R', 'S']

		"""The set of all values allowed."""
		reference_lists = [
		possible_atom_list, possible_numH_list, possible_valence_list,
		possible_formal_charge_list, possible_number_radical_e_list,
		possible_hybridization_list, possible_chirality_list
		]

		"""The number of different values that can be taken. See `get_intervals()`"""
		intervals = get_intervals(reference_lists)
		# We use E-Z notation for stereochemistry
		# https://en.wikipedia.org/wiki/E%E2%80%93Z_notation
		"""Possible stereochemistry. We use E-Z notation for stereochemistry
		https://en.wikipedia.org/wiki/E%E2%80%93Z_notation"""
		possible_bond_stereo = ["STEREONONE", "STEREOANY", "STEREOZ", "STEREOE"]
		"""Number of different bond types not counting stereochemistry."""
		bond_fdim_base = 6


		@@ -79,10 +154,39 @@ def get_feature_list(atom):
		----------
		atom: RDKit.rdchem.Atom
		Atom to get features for

		Examples
		--------
		>>> from rdkit import Chem
		>>> mol = Chem.MolFromSmiles("C")
		>>> atom = mol.GetAtoms()[0]
		>>> dc.feat.graph_features.get_feature_list(atom)
		[0, 4, 4, 3, 0, 2]

		Note
		----
		This method requires RDKit to be installed.

		Returns
		-------
		features: list
		List of length 6. The i-th value in this list provides the index of the
		atom's value in the corresponding feature value list. The 6 feature value
		lists for this function are `[GraphConvConstants.possible_atom_list,
		GraphConvConstants.possible_numH_list,
		GraphConvConstants.possible_valence_list,
		GraphConvConstants.possible_formal_charge_list,
		GraphConvConstants.possible_number_radical_e_list,
		GraphConvConstants.possible_hybridization_list]`.
		"""
		possible_atom_list = GraphConvConstants.possible_atom_list
		possible_numH_list = GraphConvConstants.possible_numH_list
		possible_valence_list = GraphConvConstants.possible_valence_list
		possible_formal_charge_list = GraphConvConstants.possible_formal_charge_list
		possible_number_radical_e_list = GraphConvConstants.possible_number_radical_e_list
		possible_hybridization_list = GraphConvConstants.possible_hybridization_list
		# Replace the hybridization
		from rdkit import Chem
		global possible_hybridization_list
		#global possible_hybridization_list
		possible_hybridization_list = [
		Chem.rdchem.HybridizationType.SP, Chem.rdchem.HybridizationType.SP2,
		Chem.rdchem.HybridizationType.SP3, Chem.rdchem.HybridizationType.SP3D,
		@@ -101,7 +205,20 @@ def get_feature_list(atom):


		def features_to_id(features, intervals):
		"""Convert list of features into index using spacings provided in intervals"""
		"""Convert list of features into index using spacings provided in intervals

		Parameters
		----------
		features: list
		List of features as returned by `get_feature_list()`
		intervals: list
		List of intervals as returned by `get_intervals()`

		Returns
		-------
		id: int
		The index in a feature vector given by the given set of features.
		"""
		id = 0
		for k in range(len(intervals)):
		id += features[k] * intervals[k]
		@@ -112,6 +229,20 @@ def features_to_id(features, intervals):


		def id_to_features(id, intervals):
		"""Given an index in a feature vector, return the original set of features.

		Parameters
		----------
		id: int
		The index in a feature vector given by the given set of features.
		intervals: list
		List of intervals as returned by `get_intervals()`

		Returns
		-------
		features: list
		List of features as returned by `get_feature_list()`
		"""
		features = 6 * [0]

		# Correct for null
		@@ -133,6 +264,11 @@ def atom_to_id(atom):
		----------
		atom: RDKit.rdchem.Atom
		Atom to convert to ids.

		Returns
		-------
		id: int
		The index in a feature vector given by the given set of features.
		"""
		features = get_feature_list(atom)
		return features_to_id(features, intervals)
		@@ -154,6 +290,10 @@ def atom_features(atom,
		If true, model hydrogens explicitly
		use_chirality: bool, optional
		If true, use chirality information.

		Returns
		-------
		np.ndarray of per-atom features.
		"""
		if bool_id_feat:
		return np.array([atom_to_id(atom)])
		@@ -245,6 +385,12 @@ def bond_features(bond, use_chirality=False):
		Note
		----
		This method requires RDKit to be installed.

		Returns
		-------
		bond_feats: np.ndarray
		Array of bond features. This is a 1-D array of length 6 if `use_chirality`
		is `False` else of length 10 with chirality encoded.
		"""
		try:
		from rdkit import Chem
		@@ -278,16 +424,24 @@ def pair_features(mol, edge_list, canon_adj_list, bt_len=6,
		Molecule to compute features on.
		edge_list: list
		List of edges to consider
		canon_adj_list: list
		TODO
		bt_len: int, optional
		TODO
		graph_distance: bool, optional
		canon_adj_list: list of lists
		`canon_adj_list[i]` is a list of the atom indices that atom `i` shares a
		bond with. This list is symmetrical so if `j in canon_adj_list[i]` then `i in
		canon_adj_list[j]`.
		bt_len: int, optional (default 6)
		The number of different bond types to consider.
		graph_distance: bool, optional (default True)
		If true, use graph distance between atoms. Else use euclidean distance.

		Note
		----
		This method requires RDKit to be installed.

		Returns
		-------
		features: np.ndarray
		Of shape `(N, N, bt_len + max_distance + 1)`. This is the array of pairwise
		features for all atom pairs.
		"""
		if graph_distance:
		max_distance = 7
		@@ -326,6 +480,28 @@ def pair_features(mol, edge_list, canon_adj_list, bt_len=6,


		def find_distance(a1, num_atoms, canon_adj_list, max_distance=7):
		"""Computes distances from provided atom.

		Parameters
		----------
		a1: RDKit atom
		The source atom to compute distances from.
		num_atoms: int
		The total number of atoms.
		canon_adj_list: list of lists
		`canon_adj_list[i]` is a list of the atom indices that atom `i` shares a
		bond with. This list is symmetrical so if `j in canon_adj_list[i]` then `i in
		canon_adj_list[j]`.
		max_distance: int, optional (default 7)
		The max distance to search.

		Returns
		-------
		distances: np.ndarray
		Of shape `(num_atoms, max_distance)`. Provides a one-hot encoding of the
		distances. That is, `distances[i]` is a one-hot encoding of the distance
		from `a1` to atom `i`.
		"""
		distance = np.zeros((num_atoms, max_distance))
		radial = 0
		# atoms `radial` bonds away from `a1`

deepchem/feat/one_hot.py

+4 −4

Original line number	Diff line number	Diff line
		@@ -25,10 +25,10 @@ class OneHotFeaturizer(MolecularFeaturizer):

		Parameters
		----------
		charset: obj:`list` of obj:`str`
		Each string is length 1
		padlength: int
		length to pad the smile strings to
		charset: list of str, optional (default None)
		A list of strings, where each string is length 1.
		padlength: int, optional (default 120)
		Length to pad the SMILES strings to.
		"""
		try:
		from rdkit import Chem

Admin message