Merge branch 'turboderp:master' into code-chat
SinanAkkoyun authored Oct 4, 2023
2 parents 433c1fa + a03d9bd commit fe047c4
Showing 7 changed files with 340 additions and 77 deletions.
5 changes: 4 additions & 1 deletion README.md
@@ -171,4 +171,7 @@ the converter to better facilitate scripted jobs.

**2023-09-27**: Prebuilt wheels are now available, credit to [@jllllll](https://github.com/jllllll). They're on the
[releases page here](https://github.com/turboderp/exllamav2/releases). A solution to installing prebuilt wheels straight
from PyPI is still pending. Updated installation instructions above.

**2023-10-03**: Added support for extended vocabularies and alternative BOS/EOS/UNK tokens and the ability to
encode/decode sequences with special tokens. Added Orca template to the chatbot example.
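
For reference, the new tokenizer options can be exercised directly. The sketch below only uses the encode() keywords that appear in examples/chat.py further down; the config/tokenizer setup and the model path are illustrative assumptions, not part of this commit.

from exllamav2 import ExLlamaV2Config, ExLlamaV2Tokenizer

config = ExLlamaV2Config()
config.model_dir = "/path/to/model"   # placeholder path, not from this commit
config.prepare()
tokenizer = ExLlamaV2Tokenizer(config)

# With encode_special_tokens enabled, added tokens such as <|im_start|> are
# mapped to their own token IDs instead of being tokenized as plain text.
text = "<|im_start|>user\nHello!<|im_end|>\n<|im_start|>assistant\n"
ids = tokenizer.encode(text, add_bos = False, add_eos = False, encode_special_tokens = True)
print(ids)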
103 changes: 41 additions & 62 deletions examples/chat.py
@@ -19,11 +19,14 @@
)

from chat_formatting import CodeBlockFormatter
from chat_prompts import prompt_formats
prompt_formats_list = list(prompt_formats.keys())

# Options

parser = argparse.ArgumentParser(description = "Simple Llama2 chat example for ExLlamaV2")
parser.add_argument("-mode", "--mode", choices = ["llama", "raw", "codellama"], help = "Chat mode. Use llama for Llama 1/2 chat finetunes.")
parser.add_argument("-modes", "--modes", action = "store_true", help = "List available modes and exit.")
parser.add_argument("-mode", "--mode", choices = prompt_formats_list, help = "Chat mode. Use llama for Llama 1/2 chat finetunes.")
parser.add_argument("-un", "--username", type = str, default = "User", help = "Username when using raw chat mode")
parser.add_argument("-bn", "--botname", type = str, default = "Chatbort", help = "Bot name when using raw chat mode")
parser.add_argument("-sp", "--system_prompt", type = str, help = "Use custom system prompt")
@@ -37,86 +40,61 @@
parser.add_argument("-resc", "--response_chunk", type = int, default = 250, help = "Space to reserve in context for reply, default = 250")
parser.add_argument("-ncf", "--no_code_formatting", action = "store_true", help = "Disable code formatting/syntax highlighting")

# Initialize model and tokenizer
# Arrrgs

model_init.add_args(parser)
args = parser.parse_args()
model_init.check_args(args)
model_init.print_options(args)
model, tokenizer = model_init.init(args)

# Create cache

cache = ExLlamaV2Cache(model)
# Prompt templates/modes

# Prompt templates
if args.modes:
print(" -- Available formats:")
for k, v in prompt_formats.items():
print(f" -- {k:12} : {v().description}")
sys.exit()

username = args.username
botname = args.botname
system_prompt = args.system_prompt
mode = args.mode

if mode == "llama" or mode == "codellama":

if not system_prompt:

if mode == "llama":

system_prompt = \
"""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. """ + \
"""Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. """ + \
"""Please ensure that your responses are socially unbiased and positive in nature."""

elif mode == "codellama":

system_prompt = \
"""You are a helpful coding assistant. Always answer as helpfully as possible."""

first_prompt = \
"""[INST] <<SYS>>\n<|system_prompt|>\n<</SYS>>\n\n<|user_prompt|> [/INST]"""

subs_prompt = \
"""[INST] <|user_prompt|> [/INST]"""

elif mode == "raw":
if args.mode is None:
print(" ## Error: No mode specified.")
sys.exit()

if not system_prompt:
prompt_format = prompt_formats[args.mode]()
prompt_format.botname = botname
prompt_format.username = username
if system_prompt is None: system_prompt = prompt_format.default_system_prompt()

system_prompt = \
f"""This is a conversation between a helpful AI assistant named {botname} and a """ + ("""user named {username}.""" if username != "User" else """user.""")
# Initialize model and tokenizer

first_prompt = \
f"""<|system_prompt|>\n{username}: <|user_prompt|>\n{botname}:"""
model_init.check_args(args)
model_init.print_options(args)
model, tokenizer = model_init.init(args)

subs_prompt = \
f"""{username}: <|user_prompt|>\n{botname}:"""
# Create cache

else:
cache = ExLlamaV2Cache(model)

print(" ## Error: Incorrect/no mode specified.")
sys.exit()

# Chat context

def format_prompt(user_prompt, first):
global system_prompt, first_prompt, subs_prompt
global system_prompt, prompt_format

if first:
return first_prompt \
return prompt_format.first_prompt() \
.replace("<|system_prompt|>", system_prompt) \
.replace("<|user_prompt|>", user_prompt)
else:
return subs_prompt \
return prompt_format.subs_prompt() \
.replace("<|user_prompt|>", user_prompt)

def encode_prompt(text):
global tokenizer, mode
global tokenizer, prompt_format

if mode == "llama" or mode == "codellama":
return tokenizer.encode(text, add_bos = True)

if mode == "raw":
return tokenizer.encode(text)
add_bos, add_eos, encode_special_tokens = prompt_format.encoding_options()
return tokenizer.encode(text, add_bos = add_bos, add_eos = add_eos, encode_special_tokens = encode_special_tokens)

user_prompts = []
responses_ids = []
Expand All @@ -130,7 +108,8 @@ def get_tokenized_context(max_len):

for turn in range(len(user_prompts)):

up_ids = encode_prompt(format_prompt(user_prompts[turn], context.shape[-1] == 0))
up_text = format_prompt(user_prompts[turn], context.shape[-1] == 0)
up_ids = encode_prompt(up_text)
context = torch.cat([context, up_ids], dim=-1)

if turn < len(responses_ids):
@@ -161,20 +140,15 @@ def get_tokenized_context(max_len):

# Stop conditions

if mode == "llama" or mode == "codellama":

generator.set_stop_conditions([tokenizer.eos_token_id])

if mode == "raw":

generator.set_stop_conditions([username + ":", username[0:1] + ":", username.upper() + ":", username.lower() + ":", tokenizer.eos_token_id])
generator.set_stop_conditions(prompt_format.stop_conditions(tokenizer))

# ANSI color codes

col_default = "\u001b[0m"
col_user = "\u001b[33;1m" # Yellow
col_bot = "\u001b[34;1m" # Blue
col_error = "\u001b[31;1m" # Red
col_sysprompt = "\u001b[37;1m" # Grey

# Code block formatting

@@ -188,6 +162,11 @@ def get_tokenized_context(max_len):

# Main loop

print(f" -- Prompt format: {args.mode}")
print(f" -- System prompt:")
print()
print(col_sysprompt + system_prompt.strip() + col_default)

while True:

# Get user prompt
Expand All @@ -207,7 +186,7 @@ def get_tokenized_context(max_len):

# Stream response

if mode == "raw":
if prompt_format.print_bot_name():

print(col_bot + botname + ": " + col_default, end = "")

@@ -288,7 +267,7 @@ def get_tokenized_context(max_len):

if eos:

if mode == "llama" or mode == "codellama":
if prompt_format.print_extra_newline():
print()

break
177 changes: 177 additions & 0 deletions examples/chat_prompts.py
@@ -0,0 +1,177 @@

class PromptFormat:

botname = "Chatbort"
username = "User"

def __init__(self):
pass

#

def default_system_prompt(self):
raise NotImplementedError

def first_prompt(self):
raise NotImplementedError

def subs_prompt(self):
raise NotImplementedError

def stop_conditions(self, tokenizer):
raise NotImplementedError

def encoding_options(self): # (add_bos, add_eos, encode_special_tokens)
raise NotImplementedError

def print_bot_name(self):
return False

def print_extra_newline(self):
return False


class PromptFormat_raw(PromptFormat):

description = "Model-agnostic mode simulating a raw chatlog"

def __init__(self):
super().__init__()
pass

def default_system_prompt(self):
return \
f"""This is a conversation between a helpful AI assistant named {self.botname} and a """ + \
(f"""user named {self.username}.""" if self.username != "User" else """user.""")

def first_prompt(self):
return \
f"""<|system_prompt|>\n{self.username}: <|user_prompt|>\n{self.botname}:"""

def subs_prompt(self):
return \
f"""{self.username}: <|user_prompt|>\n{self.botname}:"""

def stop_conditions(self, tokenizer):
return \
[self.username + ":",
self.username[0:1] + ":",
self.username.upper() + ":",
self.username.lower() + ":",
tokenizer.eos_token_id]

def encoding_options(self):
return False, False, False

def print_bot_name(self):
return True


class PromptFormat_llama(PromptFormat):

description = "Llama-chat, Llama2-chat and Mistral-instruct models"

def __init__(self):
super().__init__()
pass

def default_system_prompt(self):
return \
"""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. """ + \
"""Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. """ + \
"""Please ensure that your responses are socially unbiased and positive in nature."""

def first_prompt(self):
return \
"""[INST] <<SYS>>\n<|system_prompt|>\n<</SYS>>\n\n<|user_prompt|> [/INST]"""

def subs_prompt(self):
return \
"""[INST] <|user_prompt|> [/INST]"""

def stop_conditions(self, tokenizer):
return \
[tokenizer.eos_token_id]

def encoding_options(self):
return True, False, False

def print_extra_newline(self):
return True


class PromptFormat_codellama(PromptFormat_llama):

description = "CodeLlama-instruct"

def __init__(self):
super().__init__()
pass

def default_system_prompt(self):
return \
"""You are a helpful coding assistant. Always answer as helpfully as possible."""


class PromptFormat_chatml(PromptFormat):

description = "ChatML format, as used by e.g. (Mistral)Orca"

def __init__(self):
super().__init__()
pass

def default_system_prompt(self):
return \
f"""You are {self.botname}, a large language model. Answer as concisely as possible."""

def first_prompt(self):
return \
"""<|im_start|>system\n""" + \
"""<|system_prompt|>\n""" + \
"""<|im_end|>\n""" + \
"""<|im_start|>user\n""" + \
"""<|user_prompt|><|im_end|>\n""" + \
"""<|im_start|>assistant\n"""

def subs_prompt(self):
return \
"""<|im_end|>\n""" + \
"""<|im_start|>user\n""" + \
"""<|user_prompt|><|im_end|>\n""" + \
"""<|im_start|>assistant\n"""

def stop_conditions(self, tokenizer):
return \
[tokenizer.eos_token_id,
"""<|im_end|>"""]

def encoding_options(self):
return False, False, True

def print_extra_newline(self):
return True


class PromptFormat_tinyllama(PromptFormat_chatml):

description = "ChatML format, but ignoring special/added tokens. Use for TinyLlama-chat v0.3"

def encoding_options(self):
return False, False, False


prompt_formats = \
{
"raw": PromptFormat_raw,
"llama": PromptFormat_llama,
"codellama": PromptFormat_codellama,
"chatml": PromptFormat_chatml,
"tinyllama": PromptFormat_tinyllama,
}
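
As a usage note, here is a minimal sketch of how examples/chat.py above consumes these formats. It assumes tokenizer and generator are already initialized as in that script, and "chatml" is just one of the keys registered in the dict above.

from chat_prompts import prompt_formats

# tokenizer and generator are assumed to be set up as in examples/chat.py
prompt_format = prompt_formats["chatml"]()   # any key from the dict above
prompt_format.username = "User"
prompt_format.botname = "Chatbort"

# Build the first turn from the format's template strings
system_prompt = prompt_format.default_system_prompt()
text = prompt_format.first_prompt() \
    .replace("<|system_prompt|>", system_prompt) \
    .replace("<|user_prompt|>", "Hello!")

# Encode with the per-format options and install the matching stop conditions
add_bos, add_eos, encode_special_tokens = prompt_format.encoding_options()
ids = tokenizer.encode(text, add_bos = add_bos, add_eos = add_eos, encode_special_tokens = encode_special_tokens)
generator.set_stop_conditions(prompt_format.stop_conditions(tokenizer))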





