Merge pull request #2113 from seyonechithrananda/chemberta-tutorial (9a76353f) · Commits · 钟慕尧 / deepchem

deepchem/feat/__init__.py

+13 −0

Original line number	Diff line number	Diff line
		@@ -34,3 +34,16 @@ from deepchem.feat.molecule_featurizers import MolGraphConvFeaturizer
		from deepchem.feat.material_featurizers import ElementPropertyFingerprint
		from deepchem.feat.material_featurizers import SineCoulombMatrix
		from deepchem.feat.material_featurizers import CGCNNFeaturizer

		# Optional SMILES tokenizers: they are only exposed from this package when
		# the imports below succeed.
		try:
		    from logging import getLogger
		    logger = getLogger(__name__)
		    # `transformers` / `BertTokenizer` are not referenced again in the visible
		    # code; presumably they serve as an availability probe for the optional
		    # dependency -- TODO confirm.
		    import transformers
		    from transformers import BertTokenizer

		    from deepchem.feat.smiles_tokenizer import SmilesTokenizer
		    from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer
		except ModuleNotFoundError:
		    # NOTE(review): any ModuleNotFoundError raised by the imports above
		    # (not only a missing `transformers`) ends up here and is reported
		    # with the message below; the package import itself continues.
		    logger.warning(
		        "HuggingFace transformers is not available. Please install using 'pip install transformers' to use the SmilesTokenizer"
		    )

deepchem/feat/smiles_tokenizer.py

0 → 100644

+348 −0

Original line number	Diff line number	Diff line
		# Requirements - transformers, tokenizers
		# Right now, the Smiles Tokenizer uses an existing vocab file from rxnfp that is fairly comprehensive and from the USPTO dataset.
		# The vocab may be expanded in the near future

		import collections
		import logging
		import os
		import re
		import numpy as np
		import pkg_resources
		import typing
		from typing import List
		from transformers import BertTokenizer
		from logging import getLogger

		logger = getLogger(__name__)

		# Guarded import of the BertTokenizer base class: log a warning instead of
		# raising when the `transformers` module cannot be found.
		# NOTE(review): the import block above already runs
		# `from transformers import BertTokenizer` unconditionally, so a missing
		# package fails there before this guard is reached. Even if this guard
		# were reached, `class SmilesTokenizer(BertTokenizer)` further down still
		# needs the name to be bound.
		try:
		    from transformers import BertTokenizer
		except ModuleNotFoundError:
		    logger.warning(
		        "HuggingFace transformers is not available. Please install using 'pip install transformers' to use the SmilesTokenizer"
		    )
		"""
		SMI_REGEX_PATTERN: str
		SMILES regex pattern for tokenization. Designed by Schwaller et al.

		References

		.. [1] Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee
		ACS Central Science 2019 5 (9): Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction
		1572-1583 DOI: 10.1021/acscentsci.9b00576

		"""

		# Regex used to split a SMILES string into tokens. It is a single capture
		# group of alternatives, tried left to right: bracket atoms (``[...]``),
		# the two-letter halogens ``Br``/``Cl`` (the second letter is optional, so
		# bare ``B``/``C`` match too), the remaining organic-subset and aromatic
		# atoms, parentheses, bond and charge symbols, the reaction arrow
		# (``>`` / ``>>``), two-digit ring closures (``%NN``) and single digits.
		#
		# The alternation bars and the escaped parentheses are written plainly here;
		# a literal ``\|`` in a regex would match a pipe character instead of
		# separating alternatives.
		#
		# NOTE(review): the pattern contains a newline immediately before ``#``, so
		# that alternative only matches the two-character sequence "\n#" and a lone
		# triple-bond ``#`` is silently skipped by ``findall`` ("C#N" yields
		# ['C', 'N']). The documented tokenizer outputs and expected token ids in
		# this change rely on that behaviour, so it is preserved deliberately; it
		# is spelled as an explicit "\n" so that source indentation can never leak
		# into the pattern.
		SMI_REGEX_PATTERN = (
		    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|"
		    "\n"
		    r"#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
		)

		# Maps the logical vocabulary-file key to the file name that is written
		# when a vocabulary is saved into a directory.
		VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


		def get_default_tokenizer():
		    """Build a SmilesTokenizer from the vocabulary file bundled with deepchem.

		    The vocabulary ships at ``deepchem/feat/tests/data/vocab.txt``; the path
		    is resolved relative to the installed ``deepchem`` package.

		    Returns
		    -------
		    SmilesTokenizer
		      Tokenizer initialised with the bundled vocabulary.
		    """
		    # The bundled vocabulary lives under tests/data, not directly under tests.
		    default_vocab_path = pkg_resources.resource_filename(
		        "deepchem", "feat/tests/data/vocab.txt")
		    return SmilesTokenizer(default_vocab_path)


		class SmilesTokenizer(BertTokenizer):
		    """Vocabulary-backed SMILES tokenizer built on HuggingFace's BertTokenizer.

		    SMILES strings are split into tokens by a ``BasicSmilesTokenizer`` (using
		    the SMILES regex developed by Schwaller et al.) and every token is mapped
		    to an integer id through a one-token-per-line vocabulary file. Encoding,
		    special-token handling and the rest of the public API are inherited from
		    ``BertTokenizer``.

		    Please see https://github.com/huggingface/transformers
		    and https://github.com/rxn4chemistry/rxnfp for more details.

		    Examples
		    --------

		    >>> from deepchem.feat.smiles_tokenizer import SmilesTokenizer

		    >>> current_dir = os.path.dirname(os.path.realpath(__file__))
		    >>> vocab_path = os.path.join(current_dir, 'tests/data', 'vocab.txt')

		    >>> tokenizer = SmilesTokenizer(vocab_path)
		    >>> print(tokenizer.encode("CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1"))
		    [12, 16, 16, 16, 17, 16, 16, 18, 16, 19, 16, 17, 22, 19, 18, 33, 17, 16, 18, 23, 181, 17, 22, 19, 18, 17, 19, 16, 33, 20, 19, 55, 17, 16, 23, 18, 17, 33, 17, 19, 18, 35, 20, 19, 18, 16, 20, 22, 16, 16, 22, 16, 21, 23, 20, 23, 22, 16, 23, 22, 16, 21, 23, 18, 19, 16, 20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 13]


		    References
		    ----------
		    .. [1] Schwaller, Philippe; Probst, Daniel; Vaucher, Alain C.; Nair, Vishnu H; Kreutter, David;
		       Laino, Teodoro; et al. (2019): Mapping the Space of Chemical Reactions using Attention-Based Neural
		       Networks. ChemRxiv. Preprint. https://doi.org/10.26434/chemrxiv.9897365.v3

		    Notes
		    -----
		    This class requires huggingface's transformers and tokenizers libraries to be installed.

		    """
		    vocab_files_names = VOCAB_FILES_NAMES

		    def __init__(self, vocab_file: str = '', **kwargs):
		        """Constructs a SmilesTokenizer.

		        Parameters
		        ----------
		        vocab_file: str
		          Path to a SMILES character per line vocabulary file.
		          Default vocab file is found in deepchem/feat/tests/data/vocab.txt
		        **kwargs
		          Forwarded unchanged to ``BertTokenizer.__init__``.

		        Raises
		        ------
		        ValueError
		          If ``vocab_file`` is not an existing file.
		        """
		        # Validate before delegating so that a bad path is always reported with
		        # this class's own message, whatever the parent does with a missing file.
		        if not os.path.isfile(vocab_file):
		            raise ValueError(
		                "Can't find a vocab file at path '{}'.".format(vocab_file))

		        super().__init__(vocab_file, **kwargs)
		        # take into account special tokens in max length:
		        # [CLS] X [SEP] uses 2 slots, [CLS] A [SEP] B [SEP] uses 3.
		        # NOTE(review): relies on the parent class exposing `max_len`.
		        self.max_len_single_sentence = self.max_len - 2
		        self.max_len_sentences_pair = self.max_len - 3

		        self.vocab = load_vocab(vocab_file)
		        # Position of the last "[unused...]" placeholder in the vocabulary;
		        # -1 when the vocabulary defines none (instead of failing on an empty
		        # sequence).
		        self.highest_unused_index = max(
		            (i for i, v in enumerate(self.vocab.keys())
		             if v.startswith("[unused")),
		            default=-1)
		        # Reverse lookup table: id -> token.
		        self.ids_to_tokens = collections.OrderedDict(
		            [(ids, tok) for tok, ids in self.vocab.items()])
		        self.basic_tokenizer = BasicSmilesTokenizer()
		        self.init_kwargs["max_len"] = self.max_len

		    @property
		    def vocab_size(self):
		        """Number of entries in the vocabulary."""
		        return len(self.vocab)

		    @property
		    def vocab_list(self):
		        """Vocabulary tokens as a list, in vocabulary order."""
		        return list(self.vocab.keys())

		    def _tokenize(self, text: str):
		        """
		        Tokenize a string into a list of tokens.

		        Parameters
		        ----------
		        text: str
		          Input string sequence to be tokenized.

		        Returns
		        -------
		        list
		          Tokens produced by the underlying ``BasicSmilesTokenizer``.
		        """
		        return list(self.basic_tokenizer.tokenize(text))

		    def _convert_token_to_id(self, token):
		        """
		        Converts a token (str/unicode) in an id using the vocab.

		        Tokens missing from the vocabulary map to the id of the unknown token.

		        Parameters
		        ----------
		        token: str
		          String token from a larger sequence to be converted to a numerical id.
		        """
		        return self.vocab.get(token, self.vocab.get(self.unk_token))

		    def _convert_id_to_token(self, index):
		        """
		        Converts an index (integer) in a token (string/unicode) using the vocab.

		        Ids missing from the vocabulary map to the unknown token.

		        Parameters
		        ----------
		        index: int
		          Integer index to be converted back to a string-based token as part of a larger sequence.
		        """
		        return self.ids_to_tokens.get(index, self.unk_token)

		    def convert_tokens_to_string(self, tokens: List[str]):
		        """ Converts a sequence of tokens (string) in a single string.

		        Tokens are joined with single spaces; a " ##" continuation marker is
		        removed so the marked token is glued to its predecessor.

		        Parameters
		        ----------
		        tokens: List[str]
		          List of tokens for a given string sequence.

		        Returns
		        -------
		        out_string: str
		          Single string from combined tokens.
		        """
		        out_string: str = " ".join(tokens).replace(" ##", "").strip()
		        return out_string

		    def add_special_tokens_ids_single_sequence(self, token_ids: List[int]):
		        """
		        Adds special tokens to the a sequence for sequence classification tasks.
		        A BERT sequence has the following format: [CLS] X [SEP]

		        Parameters
		        ----------
		        token_ids: list[int]
		          list of tokenized input ids. Can be obtained using the encode or encode_plus methods.

		        Returns
		        -------
		        list[int]
		          New list: the [CLS] id, then ``token_ids``, then the [SEP] id.
		        """
		        return [self.cls_token_id] + token_ids + [self.sep_token_id]

		    def add_special_tokens_single_sequence(self, tokens: List[str]):
		        """
		        Adds special tokens to the a sequence for sequence classification tasks.
		        A BERT sequence has the following format: [CLS] X [SEP]

		        Parameters
		        ----------
		        tokens: List[str]
		          List of tokens for a given string sequence.

		        Returns
		        -------
		        List[str]
		          New list: the [CLS] token, then ``tokens``, then the [SEP] token.
		        """
		        return [self.cls_token] + tokens + [self.sep_token]

		    def add_special_tokens_ids_sequence_pair(self, token_ids_0: List[int],
		                                             token_ids_1: List[int]) -> List[int]:
		        """
		        Adds special tokens to a sequence pair for sequence classification tasks.
		        A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]

		        Parameters
		        ----------
		        token_ids_0: List[int]
		          List of ids for the first string sequence in the sequence pair (A).

		        token_ids_1: List[int]
		          List of ids for the second string sequence in the sequence pair (B).

		        Returns
		        -------
		        List[int]
		          New list laid out as [CLS] A [SEP] B [SEP].
		        """
		        sep = [self.sep_token_id]
		        cls = [self.cls_token_id]

		        return cls + token_ids_0 + sep + token_ids_1 + sep

		    def add_padding_tokens(self,
		                           token_ids: List[int],
		                           length: int,
		                           right: bool = True) -> List[int]:
		        """
		        Pads a sequence of ids with the padding token id up to ``length``.
		        By default padding tokens are added to the right of the sequence.

		        Parameters
		        ----------
		        token_ids: list[int]
		          list of tokenized input ids. Can be obtained using the encode or encode_plus methods.

		        length: int
		          Target length of the padded sequence. When it does not exceed
		          ``len(token_ids)`` no padding is added (nothing is truncated).

		        right: bool (True by default)
		          Append the padding when True, prepend it when False.

		        Returns
		        -------
		        token_ids: list[int]
		          New list containing the input ids plus the padding ids.
		        """
		        # A non-positive repeat count yields an empty list, i.e. no padding.
		        padding = [self.pad_token_id] * (length - len(token_ids))

		        if right:
		            return token_ids + padding
		        return padding + token_ids

		    def save_vocabulary(
		        self, vocab_path: str
		    ):  # -> tuple[str]: doctest issue raised with this return type annotation
		        """
		        Save the tokenizer vocabulary to a file.

		        Tokens are written one per line, ordered by their vocabulary index. A
		        warning is logged when the indices are not consecutive.

		        Parameters
		        ----------
		        vocab_path: str
		          Directory in which to save the SMILES character per line vocabulary
		          file (written as ``vocab.txt`` inside it), or the path of the file
		          itself when ``vocab_path`` is not an existing directory.

		        Returns
		        -------
		        vocab_file: Tuple[str]
		          One-element tuple with the path of the vocabulary file written.
		        """
		        if os.path.isdir(vocab_path):
		            vocab_file = os.path.join(vocab_path,
		                                      VOCAB_FILES_NAMES["vocab_file"])
		        else:
		            vocab_file = vocab_path

		        # `index` is the id the next token is expected to carry.
		        index = 0
		        with open(vocab_file, "w", encoding="utf-8") as writer:
		            for token, token_index in sorted(
		                    self.vocab.items(), key=lambda kv: kv[1]):
		                if index != token_index:
		                    logger.warning(
		                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
		                        " Please check that the vocabulary is not corrupted!".format(
		                            vocab_file))
		                    # Resynchronise so each gap is reported only once.
		                    index = token_index
		                writer.write(token + "\n")
		                index += 1
		        return (vocab_file,)


		class BasicSmilesTokenizer(object):
		    """Regex-only SMILES tokenizer.

		    Splits a SMILES string into tokens with a regular expression (by default
		    the pattern developed by Schwaller et al.). The class itself only needs a
		    compiled regex, so it is the tokenizer to use when one that does not
		    require HuggingFace's transformers library is wanted.

		    Examples
		    --------
		    >>> from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer

		    >>> tokenizer = BasicSmilesTokenizer()
		    >>> print(tokenizer.tokenize("CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1"))
		    ['C', 'C', 'C', '(', 'C', 'C', ')', 'C', 'O', 'C', '(', '=', 'O', ')', '[C@H]', '(', 'C', ')', 'N', '[P@]', '(', '=', 'O', ')', '(', 'O', 'C', '[C@H]', '1', 'O', '[C@]', '(', 'C', 'N', ')', '(', '[C@H]', '(', 'O', ')', '[C@@H]', '1', 'O', ')', 'C', '1', '=', 'C', 'C', '=', 'C', '2', 'N', '1', 'N', '=', 'C', 'N', '=', 'C', '2', 'N', ')', 'O', 'C', '1', '=', 'C', 'C', '=', 'C', 'C', '=', 'C', '1']


		    References
		    ----------
		    .. [1] Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee
		       ACS Central Science 2019 5 (9): Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction
		       1572-1583 DOI: 10.1021/acscentsci.9b00576

		    """

		    def __init__(self, regex_pattern: str = SMI_REGEX_PATTERN):
		        """Constructs a BasicSmilesTokenizer.

		        Parameters
		        ----------
		        regex_pattern: str
		          SMILES token regex; kept on the instance and compiled once here.
		        """
		        pattern_source = regex_pattern
		        self.regex_pattern = pattern_source
		        self.regex = re.compile(pattern_source)

		    def tokenize(self, text):
		        """Return the list of regex matches found in ``text``, in order.

		        Characters that the pattern does not match are left out of the result.
		        """
		        return list(self.regex.findall(text))


		def load_vocab(vocab_file):
		    """Read a one-token-per-line vocabulary file into an ordered mapping.

		    Each line, stripped of its trailing newline, becomes a key whose value is
		    the zero-based line number. If a token occurs more than once, the last
		    occurrence determines its index.

		    Parameters
		    ----------
		    vocab_file: str
		      Path of the UTF-8 encoded vocabulary file.

		    Returns
		    -------
		    collections.OrderedDict
		      Mapping of token to index, in file order.
		    """
		    token_to_index = collections.OrderedDict()
		    with open(vocab_file, "r", encoding="utf-8") as handle:
		        for line_number, raw_line in enumerate(handle):
		            token_to_index[raw_line.rstrip("\n")] = line_number
		    return token_to_index

deepchem/feat/tests/data/vocab.txt

0 → 100644

+591 −0

Original line number	Diff line number	Diff line
		[PAD]
		[unused1]
		[unused2]
		[unused3]
		[unused4]
		[unused5]
		[unused6]
		[unused7]
		[unused8]
		[unused9]
		[unused10]
		[UNK]
		[CLS]
		[SEP]
		[MASK]
		c
		C
		(
		)
		O
		1
		2
		=
		N
		.
		n
		3
		F
		Cl
		>>
		~
		-
		4
		[C@H]
		S
		[C@@H]
		[O-]
		Br
		#
		/
		[nH]
		[N+]
		s
		5
		o
		P
		[Na+]
		[Si]
		I
		[Na]
		[Pd]
		[K+]
		[K]
		[P]
		B
		[C@]
		[C@@]
		[Cl-]
		6
		[OH-]
		\
		[N-]
		[Li]
		[H]
		[2H]
		[NH4+]
		[c-]
		[P-]
		[Cs+]
		[Li+]
		[Cs]
		[NaH]
		[H-]
		[O+]
		[BH4-]
		[Cu]
		7
		[Mg]
		[Fe+2]
		[n+]
		[Sn]
		[BH-]
		[Pd+2]
		[CH]
		[I-]
		[Br-]
		[C-]
		[Zn]
		[B-]
		[F-]
		[Al]
		[P+]
		[BH3-]
		[Fe]
		[C]
		[AlH4]
		[Ni]
		[SiH]
		8
		[Cu+2]
		[Mn]
		[AlH]
		[nH+]
		[AlH4-]
		[O-2]
		[Cr]
		[Mg+2]
		[NH3+]
		[S@]
		[Pt]
		[Al+3]
		[S@@]
		[S-]
		[Ti]
		[Zn+2]
		[PH]
		[NH2+]
		[Ru]
		[Ag+]
		[S+]
		[I+3]
		[NH+]
		[Ca+2]
		[Ag]
		9
		[Os]
		[Se]
		[SiH2]
		[Ca]
		[Ti+4]
		[Ac]
		[Cu+]
		[S]
		[Rh]
		[Cl+3]
		[cH-]
		[Zn+]
		[O]
		[Cl+]
		[SH]
		[H+]
		[Pd+]
		[se]
		[PH+]
		[I]
		[Pt+2]
		[C+]
		[Mg+]
		[Hg]
		[W]
		[SnH]
		[SiH3]
		[Fe+3]
		[NH]
		[Mo]
		[CH2+]
		%10
		[CH2-]
		[CH2]
		[n-]
		[Ce+4]
		[NH-]
		[Co]
		[I+]
		[PH2]
		[Pt+4]
		[Ce]
		[B]
		[Sn+2]
		[Ba+2]
		%11
		[Fe-3]
		[18F]
		[SH-]
		[Pb+2]
		[Os-2]
		[Zr+4]
		[N]
		[Ir]
		[Bi]
		[Ni+2]
		[P@]
		[Co+2]
		[s+]
		[As]
		[P+3]
		[Hg+2]
		[Yb+3]
		[CH-]
		[Zr+2]
		[Mn+2]
		[CH+]
		[In]
		[KH]
		[Ce+3]
		[Zr]
		[AlH2-]
		[OH2+]
		[Ti+3]
		[Rh+2]
		[Sb]
		[S-2]
		%12
		[P@@]
		[Si@H]
		[Mn+4]
		p
		[Ba]
		[NH2-]
		[Ge]
		[Pb+4]
		[Cr+3]
		[Au]
		[LiH]
		[Sc+3]
		[o+]
		[Rh-3]
		%13
		[Br]
		[Sb-]
		[S@+]
		[I+2]
		[Ar]
		[V]
		[Cu-]
		[Al-]
		[Te]
		[13c]
		[13C]
		[Cl]
		[PH4+]
		[SiH4]
		[te]
		[CH3-]
		[S@@+]
		[Rh+3]
		[SH+]
		[Bi+3]
		[Br+2]
		[La]
		[La+3]
		[Pt-2]
		[N@@]
		[PH3+]
		[N@]
		[Si+4]
		[Sr+2]
		[Al+]
		[Pb]
		[SeH]
		[Si-]
		[V+5]
		[Y+3]
		[Re]
		[Ru+]
		[Sm]
		*
		[3H]
		[NH2]
		[Ag-]
		[13CH3]
		[OH+]
		[Ru+3]
		[OH]
		[Gd+3]
		[13CH2]
		[In+3]
		[Si@@]
		[Si@]
		[Ti+2]
		[Sn+]
		[Cl+2]
		[AlH-]
		[Pd-2]
		[SnH3]
		[B+3]
		[Cu-2]
		[Nd+3]
		[Pb+3]
		[13cH]
		[Fe-4]
		[Ga]
		[Sn+4]
		[Hg+]
		[11CH3]
		[Hf]
		[Pr]
		[Y]
		[S+2]
		[Cd]
		[Cr+6]
		[Zr+3]
		[Rh+]
		[CH3]
		[N-3]
		[Hf+2]
		[Th]
		[Sb+3]
		%14
		[Cr+2]
		[Ru+2]
		[Hf+4]
		[14C]
		[Ta]
		[Tl+]
		[B+]
		[Os+4]
		[PdH2]
		[Pd-]
		[Cd+2]
		[Co+3]
		[S+4]
		[Nb+5]
		[123I]
		[c+]
		[Rb+]
		[V+2]
		[CH3+]
		[Ag+2]
		[cH+]
		[Mn+3]
		[Se-]
		[As-]
		[Eu+3]
		[SH2]
		[Sm+3]
		[IH+]
		%15
		[OH3+]
		[PH3]
		[IH2+]
		[SH2+]
		[Ir+3]
		[AlH3]
		[Sc]
		[Yb]
		[15NH2]
		[Lu]
		[sH+]
		[Gd]
		[18F-]
		[SH3+]
		[SnH4]
		[TeH]
		[Si@@H]
		[Ga+3]
		[CaH2]
		[Tl]
		[Ta+5]
		[GeH]
		[Br+]
		[Sr]
		[Tl+3]
		[Sm+2]
		[PH5]
		%16
		[N@@+]
		[Au+3]
		[C-4]
		[Nd]
		[Ti+]
		[IH]
		[N@+]
		[125I]
		[Eu]
		[Sn+3]
		[Nb]
		[Er+3]
		[123I-]
		[14c]
		%17
		[SnH2]
		[YH]
		[Sb+5]
		[Pr+3]
		[Ir+]
		[N+3]
		[AlH2]
		[19F]
		%18
		[Tb]
		[14CH]
		[Mo+4]
		[Si+]
		[BH]
		[Be]
		[Rb]
		[pH]
		%19
		%20
		[Xe]
		[Ir-]
		[Be+2]
		[C+4]
		[RuH2]
		[15NH]
		[U+2]
		[Au-]
		%21
		%22
		[Au+]
		[15n]
		[Al+2]
		[Tb+3]
		[15N]
		[V+3]
		[W+6]
		[14CH3]
		[Cr+4]
		[ClH+]
		b
		[Ti+6]
		[Nd+]
		[Zr+]
		[PH2+]
		[Fm]
		[N@H+]
		[RuH]
		[Dy+3]
		%23
		[Hf+3]
		[W+4]
		[11C]
		[13CH]
		[Er]
		[124I]
		[LaH]
		[F]
		[siH]
		[Ga+]
		[Cm]
		[GeH3]
		[IH-]
		[U+6]
		[SeH+]
		[32P]
		[SeH-]
		[Pt-]
		[Ir+2]
		[se+]
		[U]
		[F+]
		[BH2]
		[As+]
		[Cf]
		[ClH2+]
		[Ni+]
		[TeH3]
		[SbH2]
		[Ag+3]
		%24
		[18O]
		[PH4]
		[Os+2]
		[Na-]
		[Sb+2]
		[V+4]
		[Ho+3]
		[68Ga]
		[PH-]
		[Bi+2]
		[Ce+2]
		[Pd+3]
		[99Tc]
		[13C@@H]
		[Fe+6]
		[c]
		[GeH2]
		[10B]
		[Cu+3]
		[Mo+2]
		[Cr+]
		[Pd+4]
		[Dy]
		[AsH]
		[Ba+]
		[SeH2]
		[In+]
		[TeH2]
		[BrH+]
		[14cH]
		[W+]
		[13C@H]
		[AsH2]
		[In+2]
		[N+2]
		[N@@H+]
		[SbH]
		[60Co]
		[AsH4+]
		[AsH3]
		[18OH]
		[Ru-2]
		[Na-2]
		[CuH2]
		[31P]
		[Ti+5]
		[35S]
		[P@@H]
		[ArH]
		[Co+]
		[Zr-2]
		[BH2-]
		[131I]
		[SH5]
		[VH]
		[B+2]
		[Yb+2]
		[14C@H]
		[211At]
		[NH3+2]
		[IrH]
		[IrH2]
		[Rh-]
		[Cr-]
		[Sb+]
		[Ni+3]
		[TaH3]
		[Tl+2]
		[64Cu]
		[Tc]
		[Cd+]
		[1H]
		[15nH]
		[AlH2+]
		[FH+2]
		[BiH3]
		[Ru-]
		[Mo+6]
		[AsH+]
		[BaH2]
		[BaH]
		[Fe+4]
		[229Th]
		[Th+4]
		[As+3]
		[NH+3]
		[P@H]
		[Li-]
		[7NaH]
		[Bi+]
		[PtH+2]
		[p-]
		[Re+5]
		[NiH]
		[Ni-]
		[Xe+]
		[Ca+]
		[11c]
		[Rh+4]
		[AcH]
		[HeH]
		[Sc+2]
		[Mn+]
		[UH]
		[14CH2]
		[SiH4+]
		[18OH2]
		[Ac-]
		[Re+4]
		[118Sn]
		[153Sm]
		[P+2]
		[9CH]
		[9CH3]
		[Y-]
		[NiH2]
		[Si+2]
		[Mn+6]
		[ZrH2]
		[C-2]
		[Bi+5]
		[24NaH]
		[Fr]
		[15CH]
		[Se+]
		[At]
		[P-3]
		[124I-]
		[CuH2-]
		[Nb+4]
		[Nb+3]
		[MgH]
		[Ir+4]
		[67Ga+3]
		[67Ga]
		[13N]
		[15OH2]
		[2NH]
		[Ho]
		[Cn]
		No newline at end of file

deepchem/feat/tests/test_smiles_tokenizer.py

0 → 100644

+31 −0

Original line number	Diff line number	Diff line
		# Requirements - transformers, tokenizers
		import os
		from unittest import TestCase
		from deepchem.feat.smiles_tokenizer import SmilesTokenizer
		from transformers import RobertaForMaskedLM


		class TestSmilesTokenizer(TestCase):
		    """Tests the SmilesTokenizer to load the USPTO vocab file and a ChemBERTa Masked LM model with pre-trained weights."""

		    def test_tokenize(self):
		        """Encoding a known SMILES string yields the expected token ids."""
		        current_dir = os.path.dirname(os.path.realpath(__file__))
		        vocab_path = os.path.join(current_dir, 'data', 'vocab.txt')
		        # Expected ids for the SMILES string below, including the leading
		        # [CLS] (12) and trailing [SEP] (13) ids from the vocab file.
		        tokenized_smiles = [
		            12, 16, 16, 16, 17, 16, 16, 18, 16, 19, 16, 17, 22, 19, 18, 33, 17, 16,
		            18, 23, 181, 17, 22, 19, 18, 17, 19, 16, 33, 20, 19, 55, 17, 16, 23, 18,
		            17, 33, 17, 19, 18, 35, 20, 19, 18, 16, 20, 22, 16, 16, 22, 16, 21, 23,
		            20, 23, 22, 16, 23, 22, 16, 21, 23, 18, 19, 16, 20, 22, 16, 16, 22, 16,
		            16, 22, 16, 20, 13
		        ]

		        # The model is loaded by name via `from_pretrained`.
		        model = RobertaForMaskedLM.from_pretrained(
		            'seyonec/SMILES_tokenized_PubChem_shard00_50k')
		        # Return value is discarded; presumably a smoke check that the model
		        # loaded -- TODO confirm.
		        model.num_parameters()

		        tokenizer = SmilesTokenizer(
		            vocab_path, max_len=model.config.max_position_embeddings)

		        # assertEqual rather than a bare `assert`: it is not stripped under
		        # `python -O` and reports a readable diff on mismatch.
		        self.assertEqual(
		            tokenized_smiles,
		            tokenizer.encode(
		                "CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1"
		            ))

docs/index.rst

+1 −0

Original line number	Diff line number	Diff line
		@@ -137,6 +137,7 @@ discussions about research, development or any general questions. If you'd like
		dataclasses
		moleculenet
		featurizers
		tokenizers
		splitters
		transformers
		models

Admin message