Commit 1314bd1c authored by seyonechithrananda's avatar seyonechithrananda
Browse files

update yapf to 0.22

parent 0e0ed6e1
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -45,4 +45,5 @@ try:
  from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer
except ModuleNotFoundError:
  logger.warning(
      "HuggingFace transformers is not available. Please install using 'pip install transformers' to use the SmilesTokenizer")
      "HuggingFace transformers is not available. Please install using 'pip install transformers' to use the SmilesTokenizer"
  )
+9 −9
Original line number Diff line number Diff line
@@ -19,7 +19,8 @@ try:
  from transformers import BertTokenizer
except ModuleNotFoundError:
  logger.warning(
      "HuggingFace transformers is not available. Please install using 'pip install transformers' to use the SmilesTokenizer")
      "HuggingFace transformers is not available. Please install using 'pip install transformers' to use the SmilesTokenizer"
  )
"""
SMI_REGEX_PATTERN: str
    SMILES regex pattern for tokenization. Designed by Schwaller et al.
@@ -103,12 +104,11 @@ class SmilesTokenizer(BertTokenizer):
    self.max_len_sentences_pair = self.max_len - 3

    if not os.path.isfile(vocab_file):
      raise ValueError("Can't find a vocab file at path '{}'.".format(
          vocab_file))
      raise ValueError(
          "Can't find a vocab file at path '{}'.".format(vocab_file))
    self.vocab = load_vocab(vocab_file)
    self.highest_unused_index = max([
        i for i, v in enumerate(self.vocab.keys()) if v.startswith("[unused")
    ])
    self.highest_unused_index = max(
        [i for i, v in enumerate(self.vocab.keys()) if v.startswith("[unused")])
    self.ids_to_tokens = collections.OrderedDict(
        [(ids, tok) for tok, ids in self.vocab.items()])
    self.basic_tokenizer = BasicSmilesTokenizer()
+9 −7
Original line number Diff line number Diff line
@@ -11,12 +11,13 @@ class TestSmilesTokenizer(TestCase):
  def test_tokenize(self):
    current_dir = os.path.dirname(os.path.realpath(__file__))
    vocab_path = os.path.join(current_dir, 'data', 'vocab.txt')
    tokenized_smiles = [12, 16, 16, 16, 17, 16, 16, 18, 16, 19, 16, 17, 22, 19,
                        18, 33, 17, 16, 18, 23, 181, 17, 22, 19, 18, 17, 19, 16,
                        33, 20, 19, 55, 17, 16, 23, 18, 17, 33, 17, 19, 18, 35,
                        20, 19, 18, 16, 20, 22, 16, 16, 22, 16, 21, 23, 20, 23,
                        22, 16, 23, 22, 16, 21, 23, 18, 19, 16, 20, 22, 16, 16,
                        22, 16, 16, 22, 16, 20, 13]
    tokenized_smiles = [
        12, 16, 16, 16, 17, 16, 16, 18, 16, 19, 16, 17, 22, 19, 18, 33, 17, 16,
        18, 23, 181, 17, 22, 19, 18, 17, 19, 16, 33, 20, 19, 55, 17, 16, 23, 18,
        17, 33, 17, 19, 18, 35, 20, 19, 18, 16, 20, 22, 16, 16, 22, 16, 21, 23,
        20, 23, 22, 16, 23, 22, 16, 21, 23, 18, 19, 16, 20, 22, 16, 16, 22, 16,
        16, 22, 16, 20, 13
    ]

    model = RobertaForMaskedLM.from_pretrained(
        'seyonec/SMILES_tokenized_PubChem_shard00_50k')
@@ -26,4 +27,5 @@ class TestSmilesTokenizer(TestCase):
        vocab_path, max_len=model.config.max_position_embeddings)

    assert tokenized_smiles == tokenizer.encode(
        "CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1")
        "CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1"
    )