Commit 20220a90 authored by nd-02110114

Merge branch 'master' into update-data-2

parents e0b9d461 8e551190
+20 −6
@@ -420,6 +420,19 @@ class Dataset(object):
    """
    raise NotImplementedError()

  def select(self, indices: Sequence[int],
             select_dir: Optional[str] = None) -> "Dataset":
    """Creates a new dataset from a selection of indices from self.

    Parameters
    ----------
    indices: Sequence
      List of indices to select.
    select_dir: str, optional (default None)
      Path to new directory that the selected indices will be copied to.
    """
    raise NotImplementedError()
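
For context, a minimal usage sketch of this `select` API on a concrete subclass (`NumpyDataset`; the toy arrays are illustrative assumptions):

import numpy as np
import deepchem as dc

# Toy dataset: 4 samples, 3 features, 1 task.
X = np.random.rand(4, 3)
y = np.random.rand(4, 1)
dataset = dc.data.NumpyDataset(X, y)

# Concrete subclasses override select(); keep samples 0 and 2.
subset = dataset.select([0, 2])
print(len(subset))  # 2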

  def get_statistics(self, X_stats: bool = True,
                     y_stats: bool = True) -> Tuple[float, ...]:
    """Compute and return statistics of this dataset.
@@ -1868,13 +1881,13 @@ class DiskDataset(Dataset):
        tasks=tasks)

  @staticmethod
  def merge(datasets: Iterable["DiskDataset"],
  def merge(datasets: Iterable["Dataset"],
            merge_dir: Optional[str] = None) -> "DiskDataset":
    """Merges provided datasets into a merged dataset.

    Parameters
    ----------
    datasets: Iterable[DiskDataset]
    datasets: Iterable[Dataset]
      List of datasets to merge.
    merge_dir: str, optional (default None)
      The new directory path to store the merged DiskDataset.
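
A usage sketch for the widened `merge` signature, which now accepts any `Dataset` rather than only `DiskDataset` (the in-memory datasets below are illustrative; this assumes `merge` materializes each input's arrays):

import numpy as np
import deepchem as dc

d1 = dc.data.NumpyDataset(np.random.rand(2, 3), np.random.rand(2, 1))
d2 = dc.data.NumpyDataset(np.random.rand(3, 3), np.random.rand(3, 1))

# merge() still returns a DiskDataset, written to merge_dir (or a temp dir).
merged = dc.data.DiskDataset.merge([d1, d2])
print(len(merged))  # 5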
@@ -1897,7 +1910,7 @@ class DiskDataset(Dataset):
    tasks = []
    for dataset in datasets:
      try:
        tasks.append(dataset.tasks)
        tasks.append(dataset.tasks)  # type: ignore
      except AttributeError:
        pass
    if tasks:
@@ -2033,7 +2046,7 @@ class DiskDataset(Dataset):

  def shuffle_each_shard(self,
                         shard_basenames: Optional[List[str]] = None) -> None:
    """Shuffles elements within each shard of the datset.
    """Shuffles elements within each shard of the dataset.

    Parameters
    ----------
@@ -2282,8 +2295,9 @@ class DiskDataset(Dataset):

    Returns
    -------
    DiskDataset
      A Dataset containing the selected samples
    Dataset
      A dataset containing the selected samples. The default dataset is `DiskDataset`.
      If `output_numpy_dataset` is True, the dataset is `NumpyDataset`.
    """
    if output_numpy_dataset and (select_dir is not None or
                                 select_shard_size is not None):
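
A brief sketch of the two return types documented above; the `output_numpy_dataset` flag comes from this diff, while the setup is illustrative:

import numpy as np
import deepchem as dc

ds = dc.data.DiskDataset.from_numpy(np.random.rand(4, 3), np.random.rand(4, 1))
print(type(ds.select([0, 1])).__name__)                             # DiskDataset
print(type(ds.select([0, 1], output_numpy_dataset=True)).__name__)  # NumpyDataset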
+9 −0
@@ -34,3 +34,12 @@ from deepchem.feat.molecule_featurizers import MolGraphConvFeaturizer
from deepchem.feat.material_featurizers import ElementPropertyFingerprint
from deepchem.feat.material_featurizers import SineCoulombMatrix
from deepchem.feat.material_featurizers import CGCNNFeaturizer

try:
  import transformers
  from transformers import BertTokenizer

  from deepchem.feat.smiles_tokenizer import SmilesTokenizer
  from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer
except ModuleNotFoundError:
  pass
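
Because of the guarded import above, downstream code should feature-detect the tokenizer rather than import it unconditionally; a minimal sketch of that pattern:

# deepchem.feat.smiles_tokenizer itself imports transformers at module load,
# so probing the import is enough to detect the optional dependency.
try:
  from deepchem.feat.smiles_tokenizer import SmilesTokenizer
  HAS_SMILES_TOKENIZER = True
except ModuleNotFoundError:
  HAS_SMILES_TOKENIZER = False

if HAS_SMILES_TOKENIZER:
  print('transformers is installed; SmilesTokenizer is available')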
+2 −0
@@ -2,6 +2,7 @@
Topological fingerprints.
"""
from deepchem.feat.base_classes import MolecularFeaturizer
import numpy as np


class CircularFingerprint(MolecularFeaturizer):
@@ -103,6 +104,7 @@ class CircularFingerprint(MolecularFeaturizer):
          useChirality=self.chiral,
          useBondTypes=self.bonds,
          useFeatures=self.features)
      fp = np.asarray(fp, dtype=np.float)
    return fp

  def __hash__(self):
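
A short usage sketch for the dtype conversion added in this hunk (requires RDKit; the SMILES string and fingerprint size are illustrative):

import deepchem as dc

featurizer = dc.feat.CircularFingerprint(size=2048)
# After the np.asarray change, featurize() yields a float array of shape
# (n_molecules, size) rather than RDKit bit-vector objects.
fps = featurizer.featurize(['CCO'])
print(fps.shape, fps.dtype)  # (1, 2048) float64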
+1 −1
@@ -400,7 +400,7 @@ def bond_features(bond, use_chirality=False):
  ]
  if use_chirality:
    bond_feats = bond_feats + one_of_k_encoding_unk(
        str(bond.GetStereo()), GraphConvCoonstants.possible_bond_stereo)
        str(bond.GetStereo()), GraphConvConstants.possible_bond_stereo)
  return np.array(bond_feats)
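
For reference, a minimal sketch of the `one_of_k_encoding_unk` helper this hunk relies on, reimplemented here from its conventional one-hot-with-unknown-bucket definition (the real helper lives in the same module):

def one_of_k_encoding_unk(x, allowable_set):
  # One-hot encode x against allowable_set; unknown values map to the
  # final slot instead of raising an error.
  if x not in allowable_set:
    x = allowable_set[-1]
  return [x == s for s in allowable_set]

print(one_of_k_encoding_unk('STEREOZ',
                            ['STEREONONE', 'STEREOANY', 'STEREOZ', 'STEREOE']))
# [False, False, True, False]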


+341 −0
# Requirements: transformers, tokenizers
# Right now, the SMILES tokenizer uses an existing vocab file from rxnfp that is fairly comprehensive and derived from the USPTO dataset.
# The vocab may be expanded in the near future.

import collections
import os
import re
import pkg_resources
from typing import List
from transformers import BertTokenizer
from logging import getLogger

logger = getLogger(__name__)
"""
SMI_REGEX_PATTERN: str
    SMILES regex pattern for tokenization. Designed by Schwaller et. al.

References

.. [1]  Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee
        ACS Central Science 2019 5 (9): Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction
        1572-1583 DOI: 10.1021/acscentsci.9b00576

"""

SMI_REGEX_PATTERN = r"""(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""

# add vocab_file dict
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


def get_default_tokenizer():
  default_vocab_path = (pkg_resources.resource_filename("deepchem",
                                                        "feat/tests/vocab.txt"))
  return SmilesTokenizer(default_vocab_path)


class SmilesTokenizer(BertTokenizer):
  """
    Creates the SmilesTokenizer class. The tokenizer heavily inherits from the BertTokenizer
    implementation found in HuggingFace's transformers library. It runs a WordPiece tokenization
    algorithm over SMILES strings using the tokenization SMILES regex developed by Schwaller et al.

    Please see https://github.com/huggingface/transformers
    and https://github.com/rxn4chemistry/rxnfp for more details.

    Examples
    --------

    >>> import os
    >>> from deepchem.feat.smiles_tokenizer import SmilesTokenizer

    >>> current_dir = os.path.dirname(os.path.realpath(__file__))
    >>> vocab_path = os.path.join(current_dir, 'tests/data', 'vocab.txt')

    >>> tokenizer = SmilesTokenizer(vocab_path)
    >>> print(tokenizer.encode("CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1"))
    [12, 16, 16, 16, 17, 16, 16, 18, 16, 19, 16, 17, 22, 19, 18, 33, 17, 16, 18, 23, 181, 17, 22, 19, 18, 17, 19, 16, 33, 20, 19, 55, 17, 16, 23, 18, 17, 33, 17, 19, 18, 35, 20, 19, 18, 16, 20, 22, 16, 16, 22, 16, 21, 23, 20, 23, 22, 16, 23, 22, 16, 21, 23, 18, 19, 16, 20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 13]


    References
    ----------
    .. [1]  Schwaller, Philippe; Probst, Daniel; Vaucher, Alain C.; Nair, Vishnu H; Kreutter, David;
            Laino, Teodoro; et al. (2019): Mapping the Space of Chemical Reactions using Attention-Based Neural
            Networks. ChemRxiv. Preprint. https://doi.org/10.26434/chemrxiv.9897365.v3

    Notes
    -----
    This class requires huggingface's transformers and tokenizers libraries to be installed.

    """
  vocab_files_names = VOCAB_FILES_NAMES

  def __init__(
      self,
      vocab_file: str = '',
      # unk_token="[UNK]",
      # sep_token="[SEP]",
      # pad_token="[PAD]",
      # cls_token="[CLS]",
      # mask_token="[MASK]",
      **kwargs):
    """Constructs a SmilesTokenizer.

        Parameters
        ----------
        vocab_file: str
            Path to a vocabulary file that lists one SMILES token per line.
            The default vocab file is found in deepchem/feat/tests/data/vocab.txt
        """

    super().__init__(vocab_file, **kwargs)
    # take into account special tokens in max length
    self.max_len_single_sentence = self.max_len - 2
    self.max_len_sentences_pair = self.max_len - 3

    if not os.path.isfile(vocab_file):
      raise ValueError(
          "Can't find a vocab file at path '{}'.".format(vocab_file))
    self.vocab = load_vocab(vocab_file)
    self.highest_unused_index = max(
        [i for i, v in enumerate(self.vocab.keys()) if v.startswith("[unused")])
    self.ids_to_tokens = collections.OrderedDict(
        [(ids, tok) for tok, ids in self.vocab.items()])
    self.basic_tokenizer = BasicSmilesTokenizer()
    self.init_kwargs["max_len"] = self.max_len

  @property
  def vocab_size(self):
    return len(self.vocab)

  @property
  def vocab_list(self):
    return list(self.vocab.keys())

  def _tokenize(self, text: str):
    """
        Tokenize a string into a list of tokens.

        Parameters
        ----------
        text: str
            Input string sequence to be tokenized.
        """

    split_tokens = self.basic_tokenizer.tokenize(text)
    return split_tokens

  def _convert_token_to_id(self, token):
    """
        Converts a token (str/unicode) to an id using the vocab.

        Parameters
        ----------
        token: str
            String token from a larger sequence to be converted to a numerical id.
        """

    return self.vocab.get(token, self.vocab.get(self.unk_token))

  def _convert_id_to_token(self, index):
    """
        Converts an index (integer) to a token (string/unicode) using the vocab.

        Parameters
        ----------
        index: int
            Integer index to be converted back to a string-based token as part of a larger sequence.
        """

    return self.ids_to_tokens.get(index, self.unk_token)

  def convert_tokens_to_string(self, tokens: List[str]):
    """ Converts a sequence of tokens (string) in a single string.

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.

        Returns
        -------
        out_string: str
            Single string from combined tokens.
        """

    out_string: str = " ".join(tokens).replace(" ##", "").strip()
    return out_string

  def add_special_tokens_ids_single_sequence(self, token_ids: List[int]):
    """
        Adds special tokens to a sequence for sequence classification tasks.
        A BERT sequence has the following format: [CLS] X [SEP]

        Parameters
        ----------

        token_ids: list[int]
            list of tokenized input ids. Can be obtained using the encode or encode_plus methods.
        """

    return [self.cls_token_id] + token_ids + [self.sep_token_id]

  def add_special_tokens_single_sequence(self, tokens: List[str]):
    """
        Adds special tokens to a sequence for sequence classification tasks.
        A BERT sequence has the following format: [CLS] X [SEP]

        Parameters
        ----------
        tokens: List[str]
            List of tokens for a given string sequence.

        """
    return [self.cls_token] + tokens + [self.sep_token]

  def add_special_tokens_ids_sequence_pair(self, token_ids_0: List[int],
                                           token_ids_1: List[int]) -> List[int]:
    """
        Adds special tokens to a sequence pair for sequence classification tasks.
        A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]

        Parameters
        ----------
        token_ids_0: List[int]
            List of ids for the first string sequence in the sequence pair (A).

        token_ids_1: List[int]
            List of ids for the second string sequence in the sequence pair (B).
        """

    sep = [self.sep_token_id]
    cls = [self.cls_token_id]

    return cls + token_ids_0 + sep + token_ids_1 + sep

  def add_padding_tokens(self,
                         token_ids: List[int],
                         length: int,
                         right: bool = True) -> List[int]:
    """
        Adds padding tokens to return a sequence of length max_length.
        By default padding tokens are added to the right of the sequence.

        Parameters
        ----------
        token_ids: list[int]
            list of tokenized input ids. Can be obtained using the encode or encode_plus methods.

        length: int

        right: bool (True by default)

        Returns
        ----------
        token_ids :
            list of tokenized input ids. Can be obtained using the encode or encode_plus methods.

        padding: int
            Integer to be added as padding token

        """
    padding = [self.pad_token_id] * (length - len(token_ids))

    if right:
      return token_ids + padding
    else:
      return padding + token_ids

  def save_vocabulary(
      self, vocab_path: str
  ):  # -> tuple[str]: doctest issue raised with this return type annotation
    """
        Save the tokenizer vocabulary to a file.

        Parameters
        ----------
        vocab_path: obj: str
            The directory in which to save the SMILES character per line vocabulary file.
            Default vocab file is found in deepchem/feat/tests/data/vocab.txt

        Returns
        ----------
        vocab_file: :obj:`Tuple(str)`:
            Paths to the files saved.
            typle with string to a SMILES character per line vocabulary file.
            Default vocab file is found in deepchem/feat/tests/data/vocab.txt

        """
    index = 0
    if os.path.isdir(vocab_path):
      vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
    else:
      vocab_file = vocab_path
    with open(vocab_file, "w", encoding="utf-8") as writer:
      for token, token_index in sorted(
          self.vocab.items(), key=lambda kv: kv[1]):
        if index != token_index:
          logger.warning(
              "Saving vocabulary to {}: vocabulary indices are not consecutive."
              " Please check that the vocabulary is not corrupted!".format(
                  vocab_file))
          index = token_index
        writer.write(token + "\n")
        index += 1
    return (vocab_file,)


class BasicSmilesTokenizer(object):
  """

    Run basic SMILES tokenization using a regex pattern developed by Schwaller et. al. This tokenizer is to be used
    when a tokenizer that does not require the transformers library by HuggingFace is required.

    Examples
    --------
    >>> from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer

    >>> tokenizer = BasicSmilesTokenizer()
    >>> print(tokenizer.tokenize("CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1"))
    ['C', 'C', 'C', '(', 'C', 'C', ')', 'C', 'O', 'C', '(', '=', 'O', ')', '[C@H]', '(', 'C', ')', 'N', '[P@]', '(', '=', 'O', ')', '(', 'O', 'C', '[C@H]', '1', 'O', '[C@]', '(', 'C', 'N', ')', '(', '[C@H]', '(', 'O', ')', '[C@@H]', '1', 'O', ')', 'C', '1', '=', 'C', 'C', '=', 'C', '2', 'N', '1', 'N', '=', 'C', 'N', '=', 'C', '2', 'N', ')', 'O', 'C', '1', '=', 'C', 'C', '=', 'C', 'C', '=', 'C', '1']


    References
    ----------
    .. [1]  Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee.
            "Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction."
            ACS Central Science 2019 5 (9): 1572-1583. DOI: 10.1021/acscentsci.9b00576

    """

  def __init__(self, regex_pattern: str = SMI_REGEX_PATTERN):
    """ Constructs a BasicSMILESTokenizer.
        Parameters
        ----------

        regex: string
            SMILES token regex

        """
    self.regex_pattern = regex_pattern
    self.regex = re.compile(self.regex_pattern)

  def tokenize(self, text):
    """ Basic Tokenization of a SMILES.
        """
    tokens = [token for token in self.regex.findall(text)]
    return tokens


def load_vocab(vocab_file):
  """Loads a vocabulary file into a dictionary."""
  vocab = collections.OrderedDict()
  with open(vocab_file, "r", encoding="utf-8") as reader:
    tokens = reader.readlines()
  for index, token in enumerate(tokens):
    token = token.rstrip("\n")
    vocab[token] = index
  return vocab
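
Taken together, a minimal end-to-end sketch of the new tokenizer (the vocab path mirrors the class docstring and is an assumption about where the file lands on disk):

import os
from deepchem.feat.smiles_tokenizer import SmilesTokenizer

current_dir = os.path.dirname(os.path.realpath(__file__))
vocab_path = os.path.join(current_dir, 'tests/data', 'vocab.txt')
tokenizer = SmilesTokenizer(vocab_path)

# encode() tokenizes with the SMILES regex and wraps the ids in [CLS]/[SEP];
# add_padding_tokens() then pads the ids out to a fixed length.
ids = tokenizer.encode('CC(=O)O')  # acetic acid, illustrative input
padded = tokenizer.add_padding_tokens(ids, length=16)
print(padded)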