Unverified Commit 9a76353f authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #2113 from seyonechithrananda/chemberta-tutorial

[WIP] Smiles Tokenizer in dc.feat
parents 3d257a0c 1314bd1c
Loading
Loading
Loading
Loading
+13 −0
Original line number Diff line number Diff line
@@ -34,3 +34,16 @@ from deepchem.feat.molecule_featurizers import MolGraphConvFeaturizer
from deepchem.feat.material_featurizers import ElementPropertyFingerprint
from deepchem.feat.material_featurizers import SineCoulombMatrix
from deepchem.feat.material_featurizers import CGCNNFeaturizer

from logging import getLogger

logger = getLogger(__name__)

# The SMILES tokenizers are optional: they are only exported when
# HuggingFace transformers can be imported. Otherwise a warning is logged
# and the rest of the package stays importable.
try:
  import transformers
  from transformers import BertTokenizer

  from deepchem.feat.smiles_tokenizer import SmilesTokenizer
  from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer
except ModuleNotFoundError:
  logger.warning(
      "HuggingFace transformers is not available. Please install using 'pip install transformers' to use the SmilesTokenizer"
  )
+348 −0
Original line number Diff line number Diff line
# Requirements - transformers, tokenizers
# Right now, the SmilesTokenizer uses an existing vocab file from rxnfp that is fairly comprehensive and was built from the USPTO dataset.
# The vocab may be expanded in the near future.

import collections
import logging
import os
import re
import numpy as np
import pkg_resources
import typing
from typing import List
from transformers import BertTokenizer
from logging import getLogger

# Module-level logger used for the optional-dependency warning below and by
# SmilesTokenizer.save_vocabulary.
logger = getLogger(__name__)

# Guarded import of the optional HuggingFace dependency: a missing
# `transformers` package is reported with an installation hint.
# NOTE(review): BertTokenizer is already imported unconditionally at the top
# of this module, so when transformers is missing the module fails on that
# earlier import and this guard is never reached. SmilesTokenizer also
# subclasses BertTokenizer at class-definition time - confirm whether this
# module is meant to be importable without transformers.
try:
  from transformers import BertTokenizer
except ModuleNotFoundError:
  logger.warning(
      "HuggingFace transformers is not available. Please install using 'pip install transformers' to use the SmilesTokenizer"
  )
"""
SMI_REGEX_PATTERN: str
    SMILES regex pattern for tokenization. Designed by Schwaller et al.

References

.. [1]  Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee
        ACS Central Science 2019 5 (9): Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction
        1572-1583 DOI: 10.1021/acscentsci.9b00576

"""

# Regex with a single capture group whose alternatives are the SMILES tokens:
# bracket atoms, one/two-letter organic-subset atoms, aromatic atoms, bonds,
# branches, ring-closure digits and two-digit "%NN" ring closures.
#
# The pattern is assembled from adjacent raw-string pieces so it can be
# wrapped across source lines without embedding a newline in the regex. A
# literal line break inside the pattern turns the triple-bond alternative
# into "\n#", which never matches a bare "#" and silently drops triple bonds
# (e.g. the "#" in "C#N") from the token stream.
# NOTE: example outputs recorded against the earlier two-line form of this
# pattern omit "#" tokens and should be regenerated.
SMI_REGEX_PATTERN = (
    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|"
    r"#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])")

# Maps the "vocab_file" resource key to the file name used for the
# vocabulary; SmilesTokenizer exposes it as `vocab_files_names` and
# `save_vocabulary` uses it when the target path is a directory.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


def get_default_tokenizer():
  """Build a SmilesTokenizer from the vocabulary file shipped with DeepChem.

  Returns
  -------
  SmilesTokenizer
      Tokenizer loaded from ``deepchem/feat/tests/data/vocab.txt``.
  """
  # The bundled vocabulary lives under tests/data (the location the class
  # docstrings and the unit test use), not directly under tests/.
  default_vocab_path = pkg_resources.resource_filename(
      "deepchem", "feat/tests/data/vocab.txt")
  return SmilesTokenizer(default_vocab_path)


class SmilesTokenizer(BertTokenizer):
  """Tokenizer for SMILES strings built on HuggingFace's ``BertTokenizer``.

  The class inherits the encoding/special-token machinery of
  ``BertTokenizer`` but overrides ``_tokenize`` so that text is split with
  the regex-based ``BasicSmilesTokenizer`` (SMILES regex developed by
  Schwaller et al.). Token/id lookups use the vocabulary file passed to the
  constructor.

  Please see https://github.com/huggingface/transformers
  and https://github.com/rxn4chemistry/rxnfp for more details.

  Examples
  --------

  >>> from deepchem.feat.smiles_tokenizer import SmilesTokenizer

  >>> current_dir = os.path.dirname(os.path.realpath(__file__))
  >>> vocab_path = os.path.join(current_dir, 'tests/data', 'vocab.txt')

  >>> tokenizer = SmilesTokenizer(vocab_path)
  >>> print(tokenizer.encode("CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1"))
  [12, 16, 16, 16, 17, 16, 16, 18, 16, 19, 16, 17, 22, 19, 18, 33, 17, 16, 18, 23, 181, 17, 22, 19, 18, 17, 19, 16, 33, 20, 19, 55, 17, 16, 23, 18, 17, 33, 17, 19, 18, 35, 20, 19, 18, 16, 20, 22, 16, 16, 22, 16, 21, 23, 20, 23, 22, 16, 23, 22, 16, 21, 23, 18, 19, 16, 20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 13]


  References
  ----------
  .. [1]  Schwaller, Philippe; Probst, Daniel; Vaucher, Alain C.; Nair, Vishnu H; Kreutter, David;
          Laino, Teodoro; et al. (2019): Mapping the Space of Chemical Reactions using Attention-Based Neural
          Networks. ChemRxiv. Preprint. https://doi.org/10.26434/chemrxiv.9897365.v3

  Notes
  -----
  This class requires huggingface's transformers and tokenizers libraries to be installed.

  """
  vocab_files_names = VOCAB_FILES_NAMES

  def __init__(self, vocab_file: str = '', **kwargs):
    """Constructs a SmilesTokenizer.

    Parameters
    ----------
    vocab_file: str
        Path to a vocabulary file with one SMILES token per line.
        A default vocab file is found in deepchem/feat/tests/data/vocab.txt
    **kwargs
        Forwarded unchanged to the parent constructor (for example
        special-token overrides or ``max_len``).

    Raises
    ------
    ValueError
        If `vocab_file` is not an existing file.
    """
    # Fail fast with a SMILES-specific message before delegating to the
    # parent constructor, which is handed the same path.
    if not os.path.isfile(vocab_file):
      raise ValueError(
          "Can't find a vocab file at path '{}'.".format(vocab_file))

    super().__init__(vocab_file, **kwargs)
    # Reserve room for the special tokens: [CLS] X [SEP] uses two slots,
    # [CLS] A [SEP] B [SEP] uses three.
    self.max_len_single_sentence = self.max_len - 2
    self.max_len_sentences_pair = self.max_len - 3

    self.vocab = load_vocab(vocab_file)
    # Position of the last "[unused...]" placeholder in the vocabulary, or
    # -1 when the vocabulary has none (max() on an empty sequence would
    # otherwise raise).
    self.highest_unused_index = max(
        (i for i, v in enumerate(self.vocab) if v.startswith("[unused")),
        default=-1)
    # Reverse mapping used by _convert_id_to_token.
    self.ids_to_tokens = collections.OrderedDict(
        [(ids, tok) for tok, ids in self.vocab.items()])
    self.basic_tokenizer = BasicSmilesTokenizer()
    self.init_kwargs["max_len"] = self.max_len

  @property
  def vocab_size(self):
    """Number of entries in the vocabulary."""
    return len(self.vocab)

  @property
  def vocab_list(self):
    """Vocabulary tokens as a list, in vocabulary order."""
    return list(self.vocab)

  def _tokenize(self, text: str):
    """Tokenize a string into a list of tokens.

    Parameters
    ----------
    text: str
        Input string sequence to be tokenized.

    Returns
    -------
    List[str]
        Tokens produced by the regex-based basic tokenizer.
    """
    return list(self.basic_tokenizer.tokenize(text))

  def _convert_token_to_id(self, token):
    """Converts a token (str) to an id using the vocab.

    Parameters
    ----------
    token: str
        String token from a larger sequence to be converted to a numerical id.

    Returns
    -------
    The id of `token`, or the id of the unknown token when `token` is not
    in the vocabulary.
    """
    return self.vocab.get(token, self.vocab.get(self.unk_token))

  def _convert_id_to_token(self, index):
    """Converts an index (integer) to a token (str) using the vocab.

    Parameters
    ----------
    index: int
        Integer index to be converted back to a string-based token as part of a larger sequence.

    Returns
    -------
    The token stored at `index`, or the unknown token when `index` is not
    in the vocabulary.
    """
    return self.ids_to_tokens.get(index, self.unk_token)

  def convert_tokens_to_string(self, tokens: List[str]):
    """Converts a sequence of tokens (string) to a single string.

    Tokens are joined with single spaces, every " ##" continuation marker
    is removed, and surrounding whitespace is stripped.

    Parameters
    ----------
    tokens: List[str]
        List of tokens for a given string sequence.

    Returns
    -------
    out_string: str
        Single string from combined tokens.
    """
    out_string: str = " ".join(tokens).replace(" ##", "").strip()
    return out_string

  def add_special_tokens_ids_single_sequence(self, token_ids: List[int]):
    """Adds special token ids to a sequence for sequence classification tasks.

    A BERT sequence has the following format: [CLS] X [SEP]

    Parameters
    ----------
    token_ids: list[int]
        list of tokenized input ids. Can be obtained using the encode or encode_plus methods.

    Returns
    -------
    list[int]
        `token_ids` wrapped in the [CLS] and [SEP] token ids.
    """
    return [self.cls_token_id] + token_ids + [self.sep_token_id]

  def add_special_tokens_single_sequence(self, tokens: List[str]):
    """Adds special tokens to a sequence for sequence classification tasks.

    A BERT sequence has the following format: [CLS] X [SEP]

    Parameters
    ----------
    tokens: List[str]
        List of tokens for a given string sequence.

    Returns
    -------
    List[str]
        `tokens` wrapped in the [CLS] and [SEP] tokens.
    """
    return [self.cls_token] + tokens + [self.sep_token]

  def add_special_tokens_ids_sequence_pair(self, token_ids_0: List[int],
                                           token_ids_1: List[int]) -> List[int]:
    """Adds special token ids to a sequence pair for sequence classification tasks.

    A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]

    Parameters
    ----------
    token_ids_0: List[int]
        List of ids for the first string sequence in the sequence pair (A).

    token_ids_1: List[int]
        List of ids for the second string sequence in the sequence pair (B).

    Returns
    -------
    List[int]
        The combined id sequence including the special token ids.
    """
    sep = [self.sep_token_id]
    cls = [self.cls_token_id]

    return cls + token_ids_0 + sep + token_ids_1 + sep

  def add_padding_tokens(self,
                         token_ids: List[int],
                         length: int,
                         right: bool = True) -> List[int]:
    """Pads a sequence of ids with the padding token id up to `length`.

    By default padding tokens are added to the right of the sequence.
    When `length` does not exceed ``len(token_ids)`` no padding is added.

    Parameters
    ----------
    token_ids: list[int]
        list of tokenized input ids. Can be obtained using the encode or encode_plus methods.

    length: int
        Target length of the padded sequence.

    right: bool (True by default)
        If True pad on the right, otherwise pad on the left.

    Returns
    -------
    list[int]
        New list containing `token_ids` plus the padding token ids.
    """
    # A non-positive repeat count yields an empty list, so over-long inputs
    # are returned without truncation.
    padding = [self.pad_token_id] * (length - len(token_ids))

    if right:
      return token_ids + padding
    else:
      return padding + token_ids

  def save_vocabulary(
      self, vocab_path: str
  ):  # -> tuple[str]: doctest issue raised with this return type annotation
    """Save the tokenizer vocabulary to a file.

    Tokens are written one per line, ordered by their vocabulary index.
    A warning is logged whenever the indices are not consecutive.

    Parameters
    ----------
    vocab_path: str
        Either an existing directory, in which case the vocabulary is
        written to ``vocab.txt`` inside it, or the path of the file to write.

    Returns
    -------
    vocab_file: Tuple[str]
        One-element tuple with the path of the vocabulary file written.
    """
    index = 0
    if os.path.isdir(vocab_path):
      vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
    else:
      vocab_file = vocab_path
    with open(vocab_file, "w", encoding="utf-8") as writer:
      for token, token_index in sorted(
          self.vocab.items(), key=lambda kv: kv[1]):
        if index != token_index:
          logger.warning(
              "Saving vocabulary to {}: vocabulary indices are not consecutive."
              " Please check that the vocabulary is not corrupted!".format(
                  vocab_file))
          # Resynchronise so only each gap is reported, not every later line.
          index = token_index
        writer.write(token + "\n")
        index += 1
    return (vocab_file,)


class BasicSmilesTokenizer(object):
  """Regex-only SMILES tokenizer.

  Splits a SMILES string with a regex pattern developed by Schwaller et al.
  Use this tokenizer when one that does not depend on the HuggingFace
  transformers library is required.

  Examples
  --------
  >>> from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer

  >>> tokenizer = BasicSmilesTokenizer()
  >>> print(tokenizer.tokenize("CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1"))
  ['C', 'C', 'C', '(', 'C', 'C', ')', 'C', 'O', 'C', '(', '=', 'O', ')', '[C@H]', '(', 'C', ')', 'N', '[P@]', '(', '=', 'O', ')', '(', 'O', 'C', '[C@H]', '1', 'O', '[C@]', '(', 'C', 'N', ')', '(', '[C@H]', '(', 'O', ')', '[C@@H]', '1', 'O', ')', 'C', '1', '=', 'C', 'C', '=', 'C', '2', 'N', '1', 'N', '=', 'C', 'N', '=', 'C', '2', 'N', ')', 'O', 'C', '1', '=', 'C', 'C', '=', 'C', 'C', '=', 'C', '1']


  References
  ----------
  .. [1]  Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee
          ACS Central Science 2019 5 (9): Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction
          1572-1583 DOI: 10.1021/acscentsci.9b00576

  """

  def __init__(self, regex_pattern: str = SMI_REGEX_PATTERN):
    """Constructs a BasicSmilesTokenizer.

    Parameters
    ----------
    regex_pattern: str
        SMILES token regex; compiled once and reused by `tokenize`.
    """
    self.regex_pattern = regex_pattern
    self.regex = re.compile(regex_pattern)

  def tokenize(self, text):
    """Return every non-overlapping regex match in `text` as a list of tokens.

    Characters the pattern does not match are skipped.
    """
    matches = self.regex.findall(text)
    return list(matches)


def load_vocab(vocab_file):
  """Read a one-token-per-line vocabulary file into an ordered mapping.

  Parameters
  ----------
  vocab_file: str
      Path to a UTF-8 text file with one token per line.

  Returns
  -------
  collections.OrderedDict
      Maps each token (trailing newline removed) to its zero-based line
      number. A token that occurs more than once keeps its first position
      in the ordering but maps to its last line number.
  """
  with open(vocab_file, "r", encoding="utf-8") as handle:
    return collections.OrderedDict(
        (line.rstrip("\n"), position) for position, line in enumerate(handle))
+591 −0
Original line number Diff line number Diff line
[PAD]
[unused1]
[unused2]
[unused3]
[unused4]
[unused5]
[unused6]
[unused7]
[unused8]
[unused9]
[unused10]
[UNK]
[CLS]
[SEP]
[MASK]
c
C
(
)
O
1
2
=
N
.
n
3
F
Cl
>>
~
-
4
[C@H]
S
[C@@H]
[O-]
Br
#
/
[nH]
[N+]
s
5
o
P
[Na+]
[Si]
I
[Na]
[Pd]
[K+]
[K]
[P]
B
[C@]
[C@@]
[Cl-]
6
[OH-]
\
[N-]
[Li]
[H]
[2H]
[NH4+]
[c-]
[P-]
[Cs+]
[Li+]
[Cs]
[NaH]
[H-]
[O+]
[BH4-]
[Cu]
7
[Mg]
[Fe+2]
[n+]
[Sn]
[BH-]
[Pd+2]
[CH]
[I-]
[Br-]
[C-]
[Zn]
[B-]
[F-]
[Al]
[P+]
[BH3-]
[Fe]
[C]
[AlH4]
[Ni]
[SiH]
8
[Cu+2]
[Mn]
[AlH]
[nH+]
[AlH4-]
[O-2]
[Cr]
[Mg+2]
[NH3+]
[S@]
[Pt]
[Al+3]
[S@@]
[S-]
[Ti]
[Zn+2]
[PH]
[NH2+]
[Ru]
[Ag+]
[S+]
[I+3]
[NH+]
[Ca+2]
[Ag]
9
[Os]
[Se]
[SiH2]
[Ca]
[Ti+4]
[Ac]
[Cu+]
[S]
[Rh]
[Cl+3]
[cH-]
[Zn+]
[O]
[Cl+]
[SH]
[H+]
[Pd+]
[se]
[PH+]
[I]
[Pt+2]
[C+]
[Mg+]
[Hg]
[W]
[SnH]
[SiH3]
[Fe+3]
[NH]
[Mo]
[CH2+]
%10
[CH2-]
[CH2]
[n-]
[Ce+4]
[NH-]
[Co]
[I+]
[PH2]
[Pt+4]
[Ce]
[B]
[Sn+2]
[Ba+2]
%11
[Fe-3]
[18F]
[SH-]
[Pb+2]
[Os-2]
[Zr+4]
[N]
[Ir]
[Bi]
[Ni+2]
[P@]
[Co+2]
[s+]
[As]
[P+3]
[Hg+2]
[Yb+3]
[CH-]
[Zr+2]
[Mn+2]
[CH+]
[In]
[KH]
[Ce+3]
[Zr]
[AlH2-]
[OH2+]
[Ti+3]
[Rh+2]
[Sb]
[S-2]
%12
[P@@]
[Si@H]
[Mn+4]
p
[Ba]
[NH2-]
[Ge]
[Pb+4]
[Cr+3]
[Au]
[LiH]
[Sc+3]
[o+]
[Rh-3]
%13
[Br]
[Sb-]
[S@+]
[I+2]
[Ar]
[V]
[Cu-]
[Al-]
[Te]
[13c]
[13C]
[Cl]
[PH4+]
[SiH4]
[te]
[CH3-]
[S@@+]
[Rh+3]
[SH+]
[Bi+3]
[Br+2]
[La]
[La+3]
[Pt-2]
[N@@]
[PH3+]
[N@]
[Si+4]
[Sr+2]
[Al+]
[Pb]
[SeH]
[Si-]
[V+5]
[Y+3]
[Re]
[Ru+]
[Sm]
*
[3H]
[NH2]
[Ag-]
[13CH3]
[OH+]
[Ru+3]
[OH]
[Gd+3]
[13CH2]
[In+3]
[Si@@]
[Si@]
[Ti+2]
[Sn+]
[Cl+2]
[AlH-]
[Pd-2]
[SnH3]
[B+3]
[Cu-2]
[Nd+3]
[Pb+3]
[13cH]
[Fe-4]
[Ga]
[Sn+4]
[Hg+]
[11CH3]
[Hf]
[Pr]
[Y]
[S+2]
[Cd]
[Cr+6]
[Zr+3]
[Rh+]
[CH3]
[N-3]
[Hf+2]
[Th]
[Sb+3]
%14
[Cr+2]
[Ru+2]
[Hf+4]
[14C]
[Ta]
[Tl+]
[B+]
[Os+4]
[PdH2]
[Pd-]
[Cd+2]
[Co+3]
[S+4]
[Nb+5]
[123I]
[c+]
[Rb+]
[V+2]
[CH3+]
[Ag+2]
[cH+]
[Mn+3]
[Se-]
[As-]
[Eu+3]
[SH2]
[Sm+3]
[IH+]
%15
[OH3+]
[PH3]
[IH2+]
[SH2+]
[Ir+3]
[AlH3]
[Sc]
[Yb]
[15NH2]
[Lu]
[sH+]
[Gd]
[18F-]
[SH3+]
[SnH4]
[TeH]
[Si@@H]
[Ga+3]
[CaH2]
[Tl]
[Ta+5]
[GeH]
[Br+]
[Sr]
[Tl+3]
[Sm+2]
[PH5]
%16
[N@@+]
[Au+3]
[C-4]
[Nd]
[Ti+]
[IH]
[N@+]
[125I]
[Eu]
[Sn+3]
[Nb]
[Er+3]
[123I-]
[14c]
%17
[SnH2]
[YH]
[Sb+5]
[Pr+3]
[Ir+]
[N+3]
[AlH2]
[19F]
%18
[Tb]
[14CH]
[Mo+4]
[Si+]
[BH]
[Be]
[Rb]
[pH]
%19
%20
[Xe]
[Ir-]
[Be+2]
[C+4]
[RuH2]
[15NH]
[U+2]
[Au-]
%21
%22
[Au+]
[15n]
[Al+2]
[Tb+3]
[15N]
[V+3]
[W+6]
[14CH3]
[Cr+4]
[ClH+]
b
[Ti+6]
[Nd+]
[Zr+]
[PH2+]
[Fm]
[N@H+]
[RuH]
[Dy+3]
%23
[Hf+3]
[W+4]
[11C]
[13CH]
[Er]
[124I]
[LaH]
[F]
[siH]
[Ga+]
[Cm]
[GeH3]
[IH-]
[U+6]
[SeH+]
[32P]
[SeH-]
[Pt-]
[Ir+2]
[se+]
[U]
[F+]
[BH2]
[As+]
[Cf]
[ClH2+]
[Ni+]
[TeH3]
[SbH2]
[Ag+3]
%24
[18O]
[PH4]
[Os+2]
[Na-]
[Sb+2]
[V+4]
[Ho+3]
[68Ga]
[PH-]
[Bi+2]
[Ce+2]
[Pd+3]
[99Tc]
[13C@@H]
[Fe+6]
[c]
[GeH2]
[10B]
[Cu+3]
[Mo+2]
[Cr+]
[Pd+4]
[Dy]
[AsH]
[Ba+]
[SeH2]
[In+]
[TeH2]
[BrH+]
[14cH]
[W+]
[13C@H]
[AsH2]
[In+2]
[N+2]
[N@@H+]
[SbH]
[60Co]
[AsH4+]
[AsH3]
[18OH]
[Ru-2]
[Na-2]
[CuH2]
[31P]
[Ti+5]
[35S]
[P@@H]
[ArH]
[Co+]
[Zr-2]
[BH2-]
[131I]
[SH5]
[VH]
[B+2]
[Yb+2]
[14C@H]
[211At]
[NH3+2]
[IrH]
[IrH2]
[Rh-]
[Cr-]
[Sb+]
[Ni+3]
[TaH3]
[Tl+2]
[64Cu]
[Tc]
[Cd+]
[1H]
[15nH]
[AlH2+]
[FH+2]
[BiH3]
[Ru-]
[Mo+6]
[AsH+]
[BaH2]
[BaH]
[Fe+4]
[229Th]
[Th+4]
[As+3]
[NH+3]
[P@H]
[Li-]
[7NaH]
[Bi+]
[PtH+2]
[p-]
[Re+5]
[NiH]
[Ni-]
[Xe+]
[Ca+]
[11c]
[Rh+4]
[AcH]
[HeH]
[Sc+2]
[Mn+]
[UH]
[14CH2]
[SiH4+]
[18OH2]
[Ac-]
[Re+4]
[118Sn]
[153Sm]
[P+2]
[9CH]
[9CH3]
[Y-]
[NiH2]
[Si+2]
[Mn+6]
[ZrH2]
[C-2]
[Bi+5]
[24NaH]
[Fr]
[15CH]
[Se+]
[At]
[P-3]
[124I-]
[CuH2-]
[Nb+4]
[Nb+3]
[MgH]
[Ir+4]
[67Ga+3]
[67Ga]
[13N]
[15OH2]
[2NH]
[Ho]
[Cn]
 No newline at end of file
+31 −0
Original line number Diff line number Diff line
# Requirements - transformers, tokenizers
import os
from unittest import TestCase
from deepchem.feat.smiles_tokenizer import SmilesTokenizer
from transformers import RobertaForMaskedLM


class TestSmilesTokenizer(TestCase):
  """Checks that SmilesTokenizer encodes a SMILES string with the USPTO vocab file, sized by a pre-trained ChemBERTa masked-LM model."""

  def test_tokenize(self):
    """Encode one SMILES string and compare against the recorded token ids."""
    smiles = "CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1"
    expected_ids = [
        12, 16, 16, 16, 17, 16, 16, 18, 16, 19, 16, 17, 22, 19, 18, 33, 17, 16,
        18, 23, 181, 17, 22, 19, 18, 17, 19, 16, 33, 20, 19, 55, 17, 16, 23, 18,
        17, 33, 17, 19, 18, 35, 20, 19, 18, 16, 20, 22, 16, 16, 22, 16, 21, 23,
        20, 23, 22, 16, 23, 22, 16, 21, 23, 18, 19, 16, 20, 22, 16, 16, 22, 16,
        16, 22, 16, 20, 13
    ]

    # The vocabulary file sits in the data/ directory next to this test.
    test_dir = os.path.dirname(os.path.realpath(__file__))
    vocab_path = os.path.join(test_dir, 'data', 'vocab.txt')

    # The pre-trained model supplies the maximum sequence length used to
    # configure the tokenizer.
    pretrained = RobertaForMaskedLM.from_pretrained(
        'seyonec/SMILES_tokenized_PubChem_shard00_50k')
    pretrained.num_parameters()

    tokenizer = SmilesTokenizer(
        vocab_path, max_len=pretrained.config.max_position_embeddings)

    assert expected_ids == tokenizer.encode(smiles)
+1 −0
Original line number Diff line number Diff line
@@ -137,6 +137,7 @@ discussions about research, development or any general questions. If you'd like
   dataclasses
   moleculenet
   featurizers
   tokenizers
   splitters
   transformers
   models
Loading