Fix bloom add prefix space (#25652)
* properly support Sequence of pretokenizers

* actual fix

* make sure the fix works. Tests are not working for sure!

* hacky way

* add TODO

* update

* add a todo

* nits

* rename test

* nits

* rename test
ArthurZucker authored and sgugger committed Aug 23, 2023
1 parent 9d42e40 commit e82040e
Showing 2 changed files with 8 additions and 1 deletion.
src/transformers/models/bloom/tokenization_bloom_fast.py (1 addition, 1 deletion)
@@ -135,7 +135,7 @@ def __init__(
 
         if add_prefix_space:
             pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
-            decoder_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
+            decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
         self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
         self.backend_tokenizer.decoder = pickle.loads(decoder_state)
 
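For context, BloomTokenizerFast.__init__ pickles the backend pre-tokenizer and decoder, flips the serialized add_prefix_space flag with a byte-level replace, and rebuilds both objects. The deleted line derived decoder_state from pre_tok_state, so the decoder was reconstructed from the pre-tokenizer's serialized state instead of its own. A minimal sketch of the corrected pattern, pulled out into a standalone helper (the helper name and free-standing form are illustrative, not part of the PR):

import pickle

def enable_add_prefix_space(backend_tokenizer):
    # Illustrative helper only; the real logic lives inline in
    # BloomTokenizerFast.__init__ and operates on self.backend_tokenizer.
    # The Rust-backed objects pickle to their serialized JSON, which is why
    # the flag can be flipped with a byte-level replace before rebuilding them.
    pre_tok_state = pickle.dumps(backend_tokenizer.pre_tokenizer)
    decoder_state = pickle.dumps(backend_tokenizer.decoder)

    pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
    # Patch the decoder's own state; the pre-fix code overwrote decoder_state
    # with the pre-tokenizer's bytes at this point.
    decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')

    backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
    backend_tokenizer.decoder = pickle.loads(decoder_state)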
tests/models/bloom/test_tokenization_bloom.py (7 additions, 0 deletions)
@@ -133,3 +133,10 @@ def test_pretrained_model_lists(self):
         # maximum sequence length of the positional embeddings.
         self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
         self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)
+
+    def test_add_prefix_space_fast(self):
+        tokenizer_w_prefix = self.get_rust_tokenizer(add_prefix_space=True)
+        tokenizer_wo_prefix = self.get_rust_tokenizer(add_prefix_space=False)
+        tokens_w_prefix = tokenizer_w_prefix.tokenize("Hey")
+        tokens_wo_prefix = tokenizer_wo_prefix.tokenize("Hey")
+        self.assertNotEqual(tokens_w_prefix, tokens_wo_prefix)
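The new test only asserts that the two settings tokenize "Hey" differently. A rough usage illustration outside the test suite (the checkpoint name bigscience/bloom-560m and the example token strings are assumptions, not part of the test):

from transformers import AutoTokenizer

# Any BLOOM checkpoint with a fast tokenizer is expected to work here.
tok_with_prefix = AutoTokenizer.from_pretrained("bigscience/bloom-560m", add_prefix_space=True)
tok_without_prefix = AutoTokenizer.from_pretrained("bigscience/bloom-560m", add_prefix_space=False)

# The two settings should now produce different tokenizations,
# e.g. something like ['ĠHey'] vs. ['Hey'] with a ByteLevel pre-tokenizer.
print(tok_with_prefix.tokenize("Hey"))
print(tok_without_prefix.tokenize("Hey"))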
