Merge branch 'master' of https://github.com/deepchem/deepchem into tutorial_ProteinDL (443540b7) · Commits · 钟慕尧 / deepchem

README.md

+1 −0

Original line number	Diff line number	Diff line
		@@ -40,6 +40,7 @@ DeepChem currently supports Python 3.6 through 3.7 and requires these packages o
		- [TensorFlow](https://www.tensorflow.org/)
		- `deepchem>=2.4.0` depends on TensorFlow v2
		- `deepchem<2.4.0` depends on TensorFlow v1
		- [Tensorflow Addons](https://www.tensorflow.org/addons) for Tensorflow v2 if you want to use advanced optimizers such as AdamW and Sparse Adam. (Optional)

		### Soft Requirements

deepchem/feat/init.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -25,6 +25,7 @@ from deepchem.feat.molecule_featurizers import MACCSKeysFingerprint
		from deepchem.feat.molecule_featurizers import MordredDescriptors
		from deepchem.feat.molecule_featurizers import Mol2VecFingerprint
		from deepchem.feat.molecule_featurizers import MolGraphConvFeaturizer
		from deepchem.feat.molecule_featurizers import PagtnMolGraphFeaturizer
		from deepchem.feat.molecule_featurizers import MolGanFeaturizer
		from deepchem.feat.molecule_featurizers import OneHotFeaturizer
		from deepchem.feat.molecule_featurizers import PubChemFingerprint

deepchem/feat/molecule_featurizers/init.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -15,4 +15,5 @@ from deepchem.feat.molecule_featurizers.rdkit_descriptors import RDKitDescriptor
		from deepchem.feat.molecule_featurizers.smiles_to_image import SmilesToImage
		from deepchem.feat.molecule_featurizers.smiles_to_seq import SmilesToSeq, create_char_to_idx
		from deepchem.feat.molecule_featurizers.mol_graph_conv_featurizer import MolGraphConvFeaturizer
		from deepchem.feat.molecule_featurizers.mol_graph_conv_featurizer import PagtnMolGraphFeaturizer
		from deepchem.feat.molecule_featurizers.molgan_featurizer import MolGanFeaturizer

deepchem/feat/molecule_featurizers/mol2vec_fingerprint.py

+39 −2

Original line number	Diff line number	Diff line
		@@ -58,13 +58,12 @@ class Mol2VecFingerprint(MolecularFeaturizer):
		"""
		try:
		from gensim.models import word2vec
		from mol2vec.features import mol2alt_sentence, sentences2vec
		from mol2vec.features import mol2alt_sentence
		except ModuleNotFoundError:
		raise ImportError("This class requires mol2vec to be installed.")

		self.radius = radius
		self.unseen = unseen
		self.sentences2vec = sentences2vec
		self.mol2alt_sentence = mol2alt_sentence
		if pretrain_model_path is None:
		data_dir = get_data_dir()
		@@ -78,6 +77,44 @@ class Mol2VecFingerprint(MolecularFeaturizer):
		# load pretrained models
		self.model = word2vec.Word2Vec.load(pretrain_model_path)

		def sentences2vec(self, sentences: list, model, unseen=None) -> np.ndarray:
		"""Generate vectors for each sentence (list) in a list of sentences. Vector is simply a
		sum of vectors for individual words.

		Parameters
		----------
		sentences : list, array
		List with sentences
		model : word2vec.Word2Vec
		Gensim word2vec model
		unseen : None, str
		Keyword for unseen words. If None, those words are skipped.
		https://stats.stackexchange.com/questions/163005/how-to-set-the-dictionary-for-text-analysis-using-neural-networks/163032#163032
		Returns
		-------
		np.array
		"""
		keys = set(model.wv.key_to_index.keys())
		vec = []
		if unseen:
		unseen_vec = model.wv.get_vector(unseen)

		for sentence in sentences:
		if unseen:
		vec.append(
		sum([
		model.wv.get_vector(y)
		if y in set(sentence) & keys else unseen_vec for y in sentence
		]))
		else:
		vec.append(
		sum([
		model.wv.get_vector(y)
		for y in sentence
		if y in set(sentence) & keys
		]))
		return np.array(vec)

		def _featurize(self, mol: RDKitMol) -> np.ndarray:
		"""
		Calculate Mordred descriptors.

deepchem/feat/molecule_featurizers/mol_graph_conv_featurizer.py

+222 −0

Original line number	Diff line number	Diff line
		@@ -4,6 +4,7 @@ import numpy as np
		from deepchem.utils.typing import RDKitAtom, RDKitBond, RDKitMol
		from deepchem.feat.graph_data import GraphData
		from deepchem.feat.base_classes import MolecularFeaturizer
		from deepchem.utils.molecule_feature_utils import one_hot_encode
		from deepchem.utils.molecule_feature_utils import get_atom_type_one_hot
		from deepchem.utils.molecule_feature_utils import construct_hydrogen_bonding_info
		from deepchem.utils.molecule_feature_utils import get_atom_hydrogen_bonding_one_hot
		@@ -18,6 +19,11 @@ from deepchem.utils.molecule_feature_utils import get_bond_type_one_hot
		from deepchem.utils.molecule_feature_utils import get_bond_is_in_same_ring_one_hot
		from deepchem.utils.molecule_feature_utils import get_bond_is_conjugated_one_hot
		from deepchem.utils.molecule_feature_utils import get_bond_stereo_one_hot
		from deepchem.utils.molecule_feature_utils import get_atom_formal_charge_one_hot
		from deepchem.utils.molecule_feature_utils import get_atom_implicit_valence_one_hot
		from deepchem.utils.molecule_feature_utils import get_atom_explicit_valence_one_hot
		from deepchem.utils.rdkit_utils import compute_all_pairs_shortest_path
		from deepchem.utils.rdkit_utils import compute_pairwise_ring_info


		def _construct_atom_feature(
		@@ -217,3 +223,219 @@ class MolGraphConvFeaturizer(MolecularFeaturizer):
		node_features=atom_features,
		edge_index=np.asarray([src, dest], dtype=int),
		edge_features=bond_features)


		class PagtnMolGraphFeaturizer(MolecularFeaturizer):
		"""This class is a featuriser of PAGTN graph networks for molecules.

		The featurization is based on `PAGTN model <https://arxiv.org/abs/1905.12712>`_. It is
		slightly more computationally intensive than default Graph Convolution Featuriser, but it
		builds a Molecular Graph connecting all atom pairs accounting for interactions of an atom with
		every other atom in the Molecule. According to the paper, interactions between two pairs
		of atom are dependent on the relative distance between them and and hence, the function needs
		to calculate the shortest path between them.

		The default node representation is constructed by concatenating the following values,
		and the feature length is 94.

		- Atom type: One hot encoding of the atom type. It consists of the most possible elements in a chemical compound.
		- Formal charge: One hot encoding of formal charge of the atom.
		- Degree: One hot encoding of the atom degree
		- Explicit Valence: One hot encoding of explicit valence of an atom. The supported possibilities
		include ``0 - 6``.
		- Implicit Valence: One hot encoding of implicit valence of an atom. The supported possibilities
		include ``0 - 5``.
		- Aromaticity: Boolean representing if an atom is aromatic.

		The default edge representation is constructed by concatenating the following values,
		and the feature length is 42. It builds a complete graph where each node is connected to
		every other node. The edge representations are calculated based on the shortest path between two nodes
		(choose any one if multiple exist). Each bond encountered in the shortest path is used to
		calculate edge features.

		- Bond type: A one-hot vector of the bond type, "single", "double", "triple", or "aromatic".
		- Conjugated: A one-hot vector of whether this bond is conjugated or not.
		- Same ring: A one-hot vector of whether the atoms in the pair are in the same ring.
		- Ring Size and Aromaticity: One hot encoding of atoms in pair based on ring size and aromaticity.
		- Distance: One hot encoding of the distance between pair of atoms.

		Examples
		--------
		>>> from deepchem.feat import PagtnMolGraphFeaturizer
		>>> smiles = ["C1CCC1", "C1=CC=CN=C1"]
		>>> featurizer = PagtnMolGraphFeaturizer(max_length=5)
		>>> out = featurizer.featurize(smiles)
		>>> type(out[0])
		<class 'deepchem.feat.graph_data.GraphData'>
		>>> out[0].num_node_features
		94
		>>> out[0].num_edge_features
		42

		References
		----------
		.. [1] Chen, Barzilay, Jaakkola "Path-Augmented Graph Transformer Network"
		10.26434/chemrxiv.8214422.

		Note
		----
		This class requires RDKit to be installed.

		"""

		def __init__(self, max_length=5):
		"""
		Parameters
		----------
		max_length : int
		Maximum distance up to which shortest paths must be considered.
		Paths shorter than max_length will be padded and longer will be
		truncated, default to ``5``.
		"""

		self.SYMBOLS = [
		'C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe',
		'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd',
		'Co', 'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In',
		'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'W', 'Ru', 'Nb', 'Re', 'Te', 'Rh',
		'Tc', 'Ba', 'Bi', 'Hf', 'Mo', 'U', 'Sm', 'Os', 'Ir', 'Ce', 'Gd', 'Ga',
		'Cs', '*', 'UNK'
		]

		self.RING_TYPES = [(5, False), (5, True), (6, False), (6, True)]
		self.ordered_pair = lambda a, b: (a, b) if a < b else (b, a)
		self.max_length = max_length

		def _pagtn_atom_featurizer(self, atom: RDKitAtom) -> np.ndarray:
		"""Calculate Atom features from RDKit atom object.

		Parameters
		----------
		mol: rdkit.Chem.rdchem.Mol
		RDKit mol object.

		Returns
		-------
		atom_feat: np.ndarray
		numpy vector of atom features.
		"""
		atom_type = get_atom_type_one_hot(atom, self.SYMBOLS, False)
		formal_charge = get_atom_formal_charge_one_hot(
		atom, include_unknown_set=False)
		degree = get_atom_total_degree_one_hot(atom, list(range(11)), False)
		exp_valence = get_atom_explicit_valence_one_hot(atom, list(range(7)), False)
		imp_valence = get_atom_implicit_valence_one_hot(atom, list(range(6)), False)
		armoticity = get_atom_is_in_aromatic_one_hot(atom)
		atom_feat = np.concatenate([
		atom_type, formal_charge, degree, exp_valence, imp_valence, armoticity
		])
		return atom_feat

		def _edge_features(self, mol: RDKitMol, path_atoms: Tuple[int, ...],
		ring_info) -> np.ndarray:
		"""Computes the edge features for a given pair of nodes.

		Parameters
		----------
		mol : : RDKitMol
		RDKit molecule instance.
		path_atoms: tuple
		Shortest path between the given pair of nodes.
		ring_info: list
		Different rings that contain the pair of atoms
		"""
		features = []
		path_bonds = []
		path_length = len(path_atoms)
		for path_idx in range(path_length - 1):
		bond = mol.GetBondBetweenAtoms(path_atoms[path_idx],
		path_atoms[path_idx + 1])
		if bond is None:
		import warnings
		warnings.warn('Valid idx of bonds must be passed')
		path_bonds.append(bond)

		for path_idx in range(self.max_length):
		if path_idx < len(path_bonds):
		bond_type = get_bond_type_one_hot(path_bonds[path_idx])
		conjugacy = get_bond_is_conjugated_one_hot(path_bonds[path_idx])
		ring_attach = get_bond_is_in_same_ring_one_hot(path_bonds[path_idx])
		features.append(np.concatenate([bond_type, conjugacy, ring_attach]))
		else:
		features.append(np.zeros(6))

		if path_length + 1 > self.max_length:
		path_length = self.max_length + 1
		position_feature = np.zeros(self.max_length + 2)
		position_feature[path_length] = 1
		features.append(position_feature)
		if ring_info:
		rfeat = [
		one_hot_encode(r, allowable_set=self.RING_TYPES) for r in ring_info
		]
		# The 1.0 float value represents True Boolean
		rfeat = [1.0] + np.any(rfeat, axis=0).tolist()
		features.append(rfeat)
		else:
		# This will return a boolean vector with all entries False
		features.append([0.0] +
		one_hot_encode(ring_info, allowable_set=self.RING_TYPES))
		return np.concatenate(features, axis=0)

		def _pagtn_edge_featurizer(self,
		mol: RDKitMol) -> Tuple[np.ndarray, np.ndarray]:
		"""Calculate bond features from RDKit mol object.

		Parameters
		----------
		mol: rdkit.Chem.rdchem.Mol
		RDKit mol object.

		Returns
		-------
		np.ndarray
		Source and Destination node indexes of each bond.
		np.ndarray
		numpy vector of bond features.
		"""
		n_atoms = mol.GetNumAtoms()
		# To get the shortest paths between two nodes.
		paths_dict = compute_all_pairs_shortest_path(mol)
		# To get info if two nodes belong to the same ring.
		rings_dict = compute_pairwise_ring_info(mol)
		# Featurizer
		feats = []
		src = []
		dest = []
		for i in range(n_atoms):
		for j in range(n_atoms):
		src.append(i)
		dest.append(j)

		if (i, j) not in paths_dict:
		feats.append(np.zeros(7 * self.max_length + 7))
		continue
		ring_info = rings_dict.get(self.ordered_pair(i, j), [])
		feats.append(self._edge_features(mol, paths_dict[(i, j)], ring_info))

		return np.array([src, dest], dtype=np.int), np.array(feats, dtype=np.float)

		def _featurize(self, mol: RDKitMol) -> GraphData:
		"""Calculate molecule graph features from RDKit mol object.

		Parameters
		----------
		mol: rdkit.Chem.rdchem.Mol
		RDKit mol object.

		Returns
		-------
		graph: GraphData
		A molecule graph with some features.
		"""
		node_features = np.asarray(
		[self._pagtn_atom_featurizer(atom) for atom in mol.GetAtoms()],
		dtype=np.float)
		edge_index, edge_features = self._pagtn_edge_featurizer(mol)
		graph = GraphData(node_features, edge_index, edge_features)
		return graph

Admin message