ggerganov · BramVanroy · May 15, 2024 · May 15, 2024 · ggerganov · May 17, 2024
diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
@@ -77,6 +77,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "jina-v2-en",     "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
     {"name": "jina-v2-es",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+    { "name": "phi-2",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
 ]
 
 # make directory "models/tokenizers" if it doesn't exist

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
@@ -469,6 +469,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
             res = "jina-v2-de"
+        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+            # ref: https://huggingface.co/microsoft/phi-2
+            res = "phi-2"
 // Map the pre-tokenizer name string (tokenizer_pre) onto vocab.type_pre.
 // An unrecognized non-empty name is a hard error (std::runtime_error).
 // for now, only BPE models have pre-tokenizers
 if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
     if (tokenizer_pre.empty()) {
         // no name available: warn loudly and fall back to the default pre-tokenizer
         LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
         LLAMA_LOG_WARN("%s:                                             \n", __func__);
         LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
         LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED!        \n", __func__);
         LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL             \n", __func__);
         LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
         LLAMA_LOG_WARN("%s:                                             \n", __func__);
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
     } else if (
             tokenizer_pre == "default") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
     } else if (
             tokenizer_pre == "llama3"   ||
             tokenizer_pre == "llama-v3" ||
             tokenizer_pre == "llama-bpe") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
     } else if (
             tokenizer_pre == "deepseek-llm") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
     } else if (
             tokenizer_pre == "deepseek-coder") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
     } else if (
             tokenizer_pre == "falcon") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
     } else if (
             tokenizer_pre == "mpt") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
     } else if (
             tokenizer_pre == "starcoder") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
     } else if (
             // "phi-2" is the name convert-hf-to-gguf.py assigns to microsoft/phi-2;
             // without an entry here such models fall through to the throw below.
             // NOTE(review): grouped with GPT-2 on the assumption that phi-2 uses
             // GPT-2 style pre-tokenization - confirm against the reference tokenizer.
             tokenizer_pre == "gpt-2"   ||
             tokenizer_pre == "phi-2"   ||
             tokenizer_pre == "jina-es" ||
             tokenizer_pre == "jina-de" ||
             tokenizer_pre == "jina-v2-es" ||
             tokenizer_pre == "jina-v2-de") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
     } else if (
             tokenizer_pre == "refact") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
     } else if (
             tokenizer_pre == "command-r") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
     } else if (
             tokenizer_pre == "qwen2") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
     } else if (
             tokenizer_pre == "olmo") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
     } else if (
             tokenizer_pre == "dbrx") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
     } else {
         // fail fast rather than silently tokenizing with the wrong rules
         throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
     }
 } else {
     // non-BPE vocabularies always use the default setting
     vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 }
 // Map the pre-tokenizer name string (tokenizer_pre) onto vocab.type_pre.
 // An unrecognized non-empty name is a hard error (std::runtime_error).
 // for now, only BPE models have pre-tokenizers
 if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
     if (tokenizer_pre.empty()) {
         // no name available: warn loudly and fall back to the default pre-tokenizer
         LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
         LLAMA_LOG_WARN("%s:                                             \n", __func__);
         LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
         LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED!        \n", __func__);
         LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL             \n", __func__);
         LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
         LLAMA_LOG_WARN("%s:                                             \n", __func__);
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
     } else if (
             tokenizer_pre == "default") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
     } else if (
             tokenizer_pre == "llama3"   ||
             tokenizer_pre == "llama-v3" ||
             tokenizer_pre == "llama-bpe") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
     } else if (
             tokenizer_pre == "deepseek-llm") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
     } else if (
             tokenizer_pre == "deepseek-coder") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
     } else if (
             tokenizer_pre == "falcon") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
     } else if (
             tokenizer_pre == "mpt") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
     } else if (
             tokenizer_pre == "starcoder") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
     } else if (
             // "phi-2" is the name convert-hf-to-gguf.py assigns to microsoft/phi-2;
             // without an entry here such models fall through to the throw below.
             // NOTE(review): grouped with GPT-2 on the assumption that phi-2 uses
             // GPT-2 style pre-tokenization - confirm against the reference tokenizer.
             tokenizer_pre == "gpt-2"   ||
             tokenizer_pre == "phi-2"   ||
             tokenizer_pre == "jina-es" ||
             tokenizer_pre == "jina-de" ||
             tokenizer_pre == "jina-v2-es" ||
             tokenizer_pre == "jina-v2-de") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
     } else if (
             tokenizer_pre == "refact") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
     } else if (
             tokenizer_pre == "command-r") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
     } else if (
             tokenizer_pre == "qwen2") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
     } else if (
             tokenizer_pre == "olmo") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
     } else if (
             tokenizer_pre == "dbrx") {
         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
     } else {
         // fail fast rather than silently tokenizing with the wrong rules
         throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
     }
 } else {
     // non-BPE vocabularies always use the default setting
     vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 }
 
         if res is None:
             logger.warning("\n")