Merge pull request #1618 from VIGS25/chemnet-featurizers (4e382ee8) · Commits · 钟慕尧 / deepchem

deepchem/feat/init.py

+2 −1

Original line number	Diff line number	Diff line
		@@ -27,3 +27,4 @@ from deepchem.feat.raw_featurizer import RawFeaturizer
		from deepchem.feat.atomic_coordinates import AtomicCoordinates
		from deepchem.feat.atomic_coordinates import NeighborListComplexAtomicCoordinates
		from deepchem.feat.adjacency_fingerprints import AdjacencyFingerprint
		from deepchem.feat.smiles_featurizers import SmilesToSeq, SmilesToImage

deepchem/feat/smiles_featurizers.py

0 → 100644

+256 −0

Original line number	Diff line number	Diff line
		"""
		Featurizer implementations used in ChemCeption and Smiles2Vec models.
		SmilesToSeq featurizer for Smiles2Vec models taken from https://arxiv.org/abs/1712.02734
		SmilesToImage featurizer for ChemCeption models taken from https://arxiv.org/abs/1710.02238
		"""

		from __future__ import division
		from __future__ import unicode_literals

		__author__ = "Vignesh Ram Somnath"
		__license__ = "MIT"

		import numpy as np
		import pandas as pd
		from deepchem.feat import Featurizer

		PAD_TOKEN = "<pad>"
		OUT_OF_VOCAB_TOKEN = "<unk>"


		def create_char_to_idx(filename,
		max_len=250,
		smiles_field="smiles",
		verbose=False):
		"""Creates a dictionary with character to index mapping.

		Parameters
		----------
		filename: str,
		Name of the file containing the SMILES strings
		max_len: int, default 250
		Maximum allowed length of the SMILES string
		smiles_field: str, default smiles
		Field indicating the SMILES strings int the file.
		verbose: bool, default True
		Whether to print the progress
		"""
		smiles_df = pd.read_csv(filename)
		char_set = set()
		for smile in smiles_df[smiles_field]:
		if len(smile) <= max_len:
		char_set.update(set(smile))

		unique_char_list = list(char_set)
		unique_char_list += [PAD_TOKEN, OUT_OF_VOCAB_TOKEN]
		if verbose:
		print("Number of unique characters: ", len(unique_char_list))

		char_to_idx = {letter: idx for idx, letter in enumerate(unique_char_list)}

		if verbose:
		print(unique_char_list)
		return char_to_idx


		class SmilesToSeq(Featurizer):
		"""
		SmilesToSeq Featurizer takes a SMILES string, and turns it into a sequence.
		Details taken from https://arxiv.org/abs/1712.02734.

		SMILES strings smaller than a specified max length (max_len) are padded using
		the PAD token while those larger than the max length are not considered. Based
		on the paper, there is also the option to add extra padding (pad_len) on both
		sides of the string after length normalization. Using a character to index (char_to_idx)
		mapping, the SMILES characters are turned into indices and the
		resulting sequence of indices serves as the input for an embedding layer.

		"""

		def __init__(self, char_to_idx, max_len=250, pad_len=10, **kwargs):
		"""
		Parameters
		----------
		char_to_idx: dict
		Dictionary containing character to index mappings for unique characters
		max_len: int, default 250
		Maximum allowed length of the SMILES string
		pad_len: int, default 10
		Amount of padding to add on either side of the SMILES seq
		"""
		self.max_len = max_len
		self.char_to_idx = char_to_idx
		self.idx_to_char = {idx: letter for letter, idx in self.char_to_idx.items()}
		self.pad_len = pad_len
		super(SmilesToSeq, self).__init__(**kwargs)

		def to_seq(self, smile):
		"""Turns list of smiles characters into array of indices"""
		out_of_vocab_idx = self.char_to_idx[OUT_OF_VOCAB_TOKEN]
		seq = [
		self.char_to_idx.get(character, out_of_vocab_idx) for character in smile
		]
		return np.array(seq)

		def remove_pad(self, characters):
		"""Removes PAD_TOKEN from the character list."""
		characters = characters[self.pad_len:]
		characters = characters[:-self.pad_len]
		chars = list()

		for char in characters:
		if char != PAD_TOKEN:
		chars.append(char)
		return chars

		def smiles_from_seq(self, seq):
		"""Reconstructs SMILES string from sequence."""
		characters = [self.idx_to_char[i] for i in seq]

		characters = self.remove_pad(characters)
		smile = "".join([letter for letter in characters])
		return smile

		def _featurize(self, mol):
		"""Featurizes a SMILES sequence."""
		from rdkit import Chem
		smile = Chem.MolToSmiles(mol)
		if len(smile) > self.max_len:
		return list()

		smile_list = list(smile)
		# Extend shorter strings with padding
		if len(smile) < self.max_len:
		smile_list.extend([PAD_TOKEN] * (self.max_len - len(smile)))

		# Padding before and after
		smile_list += [PAD_TOKEN] * self.pad_len
		smile_list = [PAD_TOKEN] * self.pad_len + smile_list

		smile_seq = self.to_seq(smile_list)
		return smile_seq


		class SmilesToImage(Featurizer):
		"""
		SmilesToImage Featurizer takes a SMILES string, and turns it into an image.
		Details taken from https://arxiv.org/abs/1712.02734.

		The default size of for the image is 80 x 80. Two image modes are currently
		supported - std & engd. std is the gray scale specification,
		with atomic numbers as pixel values for atom positions and a constant value of
		2 for bond positions. engd is a 4-channel specification, which uses atom
		properties like hybridization, valency, charges in addition to atomic number.
		Bond type is also used for the bonds.

		The coordinates of all atoms are computed, and lines are drawn between atoms
		to indicate bonds. For the respective channels, the atom and bond positions are
		set to the property values as mentioned in the paper.
		"""

		def __init__(self,
		img_size=80,
		res=0.5,
		max_len=250,
		img_spec="std",
		**kwargs):
		"""
		Parameters
		----------
		img_size: int, default 80
		Size of the image tensor
		res: float, default 0.5
		Displays the resolution of each pixel in Angstrom
		max_len: int, default 250
		Maximum allowed length of SMILES string
		img_spec: str, default std
		Indicates the channel organization of the image tensor
		"""
		if img_spec not in ["std", "engd"]:
		raise ValueError(
		"Image mode must be one of std or engd. {} is not supported".format(
		img_spec))
		self.img_size = img_size
		self.max_len = max_len
		self.res = res
		self.img_spec = img_spec
		self.embed = int(dims * res / 2)
		super(SmilesToImage, self).__init__(**kwargs)

		def _featurize(self, mol):
		"""Featurizes a single SMILE sequence."""
		from rdkit import Chem
		from rdkit.Chem import AllChem

		smile = Chem.MolToSmiles(mol)
		if len(smile) > self.max_len:
		return list()

		cmol = Chem.Mol(mol.ToBinary())
		cmol.ComputeGasteigerCharges()
		AllChem.Compute2DCoords(cmol)
		atom_coords = cmol.GetConformer(0).GetPositions()

		if self.img_spec == "std":
		# Setup image
		img = np.zeros((self.img_size, self.img_size, 1))
		# Compute bond properties
		bond_props = np.array(
		[[2.0, bond.GetBeginAtomIdx(),
		bond.GetEndAtomIdx()] for bond in mol.GetBonds()])
		# Compute atom properties
		atom_props = np.array([[atom.GetAtomicNum()] for atom in cmol.GetAtoms()])

		else:
		# Setup image
		img = np.zeros((self.img_size, self.img_size, 4))
		# Compute bond properties
		bond_props = np.array([[
		bond.GetBondTypeAsDouble(),
		bond.GetBeginAtomIdx(),
		bond.GetEndAtomIdx()
		] for bond in mol.GetBonds()])
		# Compute atom properties
		atom_props = np.array([[
		atom.GetAtomicNum(),
		atom.GetProp("_GasteigerCharge"),
		atom.GetExplicitValence(),
		atom.GetHybridization().real,
		] for atom in cmol.GetAtoms()])

		frac = np.linspace(0, 1, int(1 / self.res * 2))
		# Reshape done for proper broadcast
		frac = frac.reshape(-1, 1, 1)

		try:
		bond_begin_idxs = bond_props[:, 1].astype(int)
		bond_end_idxs = bond_props[:, 2].astype(int)

		# Reshapes, and axes manipulations to facilitate vector processing.
		begin_coords = atom_coords[bond_begin_idxs]
		begin_coords = np.expand_dims(begin_coords.T, axis=0)
		end_coords = atom_coords[bond_end_idxs]
		end_coords = np.expand_dims(end_coords.T, axis=0)

		# Draw a line between the two atoms.
		# The coordinates of this line, are indicated in line_coords
		line_coords = frac * begin_coords + (1 - frac) * end_coords
		# Turn the line coordinates into image positions
		bond_line_idxs = np.ceil(
		(line_coords[:, 0] + self.embed) / self.res).astype(int)
		bond_line_idys = np.ceil(
		(line_coords[:, 1] + self.embed) / self.res).astype(int)
		# Set the bond line coordinates to the bond property used.
		img[bond_line_idxs, bond_line_idys, 0] = bond_props[:, 0]

		# Turn atomic coordinates into image positions
		atom_idxs = np.round(
		(atom_coords[:, 0] + self.embed) / self.res).astype(int)
		atom_idys = np.round(
		(atom_coords[:, 1] + self.embed) / self.res).astype(int)
		# Set the atom positions in image to different atomic properties in channels
		img[atom_idxs, atom_idys, :] = atom_props
		return img

		except IndexError as e:
		return []

deepchem/feat/tests/data/chembl_25_small.csv

0 → 100644

+101 −0

File added.

Preview size limit exceeded, changes collapsed.

deepchem/feat/tests/test_smiles_featurizers.py

0 → 100644

+41 −0

Original line number	Diff line number	Diff line
		from unittest import TestCase
		import numpy as np
		from nose.tools import assert_equals
		from deepchem.feat import SmilesToSeq, SmilesToImage
		from deepchem.feat.smiles_featurizers import create_char_to_idx
		import os


		class TestSmilesFeaturizers(TestCase):
		"""Tests for SmilesToSeq and SmilesToImage featurizers."""

		def setUp(self):
		"""Setup."""
		pad_len = 5
		max_len = 35
		filename = os.path.join(
		os.path.dirname(__file__), "data", "chembl_25_small.csv")
		char_to_idx = create_char_to_idx(filename, max_len=max_len)
		self.feat = SmilesToSeq(
		char_to_idx=char_to_idx, max_len=max_len, pad_len=pad_len)

		def test_smiles_to_seq_featurize(self):
		"""Test SmilesToSeq featurization."""
		from rdkit import Chem
		smiles = ["Cn1c(=O)c2c(ncn2C)n(C)c1=O", "CC(=O)N1CN(C(C)=O)C(O)C1O"]
		mols = [Chem.MolFromSmiles(smile) for smile in smiles]
		expected_seq_len = self.feat.max_len + 2 * self.feat.pad_len

		features = self.feat.featurize(mols)
		assert_equals(features.shape[0], len(smiles))
		assert_equals(features.shape[-1], expected_seq_len)

		def test_reconstruct_from_seq(self):
		"""Test SMILES reconstruction from features."""
		smiles = ["Cn1c(=O)c2c(ncn2C)n(C)c1=O"]
		from rdkit import Chem
		mols = [Chem.MolFromSmiles(smile) for smile in smiles]
		features = self.feat.featurize(mols)

		reconstructed_smile = self.feat.smiles_from_seq(features[0])
		assert_equals(smiles[0], reconstructed_smile)

deepchem/molnet/init.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -33,6 +33,7 @@ from deepchem.molnet.load_function.factors_datasets import load_factors
		from deepchem.molnet.load_function.kinase_datasets import load_kinase
		from deepchem.molnet.load_function.thermosol_datasets import load_thermosol
		from deepchem.molnet.load_function.hppb_datasets import load_hppb
		from deepchem.molnet.load_function.chembl25_datasets import load_chembl25

		from deepchem.molnet.dnasim import simulate_motif_density_localization
		from deepchem.molnet.dnasim import simulate_motif_counting

Admin message