Skip to content

Commit

Permalink
fix llama3 eot. (#8371)
Browse files: browse the repository at this point in the history
  • Loading branch information
ZHUI committed May 13, 2024
1 parent d9f555e commit 474aaaa
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions paddlenlp/transformers/llama/tokenizer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -295,11 +295,12 @@ def _pad(
ENDOFTEXT = "<|end_of_text|>"
IMSTART = "<|start_header_id|>"
IMEND = "<|end_header_id|>"
EOTID = "<|eot_id|>"
# Because the default behavior has changed to allow special tokens in
# regular text, the surface forms of the special tokens need to be
# as distinct as possible to minimize the impact.
EXTRAS = tuple((f"<|reserved_special_token_{i}|>" for i in range(250)))
SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND) + EXTRAS[4:]
EXTRAS = tuple((f"<|reserved_special_token_{i}|>" for i in range(251)))
SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND, EXTRAS[4], EOTID) + EXTRAS[5:]

tiktoken = None

Expand Down Expand Up @@ -354,9 +355,11 @@ def __init__(

self.tokenizer = enc # type: tiktoken.Encoding

self.bod_id = self.special_tokens[BEGINOFTEXT]
self.eod_id = self.special_tokens[ENDOFTEXT]
self.start_header_id = self.special_tokens[IMSTART]
self.end_header_id = self.special_tokens[IMEND]
self.eot_id = self.special_tokens[EOTID]

if "pad_token_id" in kwargs:
self.pad_token_id = kwargs["pad_token_id"]
Expand Down

0 comments on commit 474aaaa

Please sign in to comment.