From a18471688f0b79f06fb7078b01b68eeddc88eae4 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Sat, 19 Oct 2024 07:18:56 +0200
Subject: [PATCH] fix: removed tokenizer

---
 scrapegraphai/utils/tokenizer.py                   | 8 --------
 scrapegraphai/utils/tokenizers/tokenizer_ollama.py | 7 +------
 2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/scrapegraphai/utils/tokenizer.py b/scrapegraphai/utils/tokenizer.py
index 8d5577fd..f6650672 100644
--- a/scrapegraphai/utils/tokenizer.py
+++ b/scrapegraphai/utils/tokenizer.py
@@ -6,7 +6,6 @@
 from langchain_ollama import ChatOllama
 from langchain_mistralai import ChatMistralAI
 from langchain_core.language_models.chat_models import BaseChatModel
-from transformers import GPT2TokenizerFast
 
 def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int:
     """
@@ -24,13 +23,6 @@ def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int:
         from .tokenizers.tokenizer_ollama import num_tokens_ollama
         num_tokens_fn = num_tokens_ollama
 
-    elif isinstance(llm_model, GPT2TokenizerFast):
-        def num_tokens_gpt2(text: str, model: BaseChatModel) -> int:
-            tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-            tokens = tokenizer.encode(text)
-            return len(tokens)
-        num_tokens_fn = num_tokens_gpt2
-
     else:
         from .tokenizers.tokenizer_openai import num_tokens_openai
         num_tokens_fn = num_tokens_openai
diff --git a/scrapegraphai/utils/tokenizers/tokenizer_ollama.py b/scrapegraphai/utils/tokenizers/tokenizer_ollama.py
index feb59e6b..a981e25c 100644
--- a/scrapegraphai/utils/tokenizers/tokenizer_ollama.py
+++ b/scrapegraphai/utils/tokenizers/tokenizer_ollama.py
@@ -3,7 +3,6 @@
 """
 from langchain_core.language_models.chat_models import BaseChatModel
 from ..logging import get_logger
-from transformers import GPT2TokenizerFast
 
 def num_tokens_ollama(text: str, llm_model:BaseChatModel) -> int:
     """
@@ -22,12 +21,8 @@ def num_tokens_ollama(text: str, llm_model:BaseChatModel) -> int:
 
     logger.debug(f"Counting tokens for text of {len(text)} characters")
 
-    if isinstance(llm_model, GPT2TokenizerFast):
-        tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-        tokens = tokenizer.encode(text)
-        return len(tokens)
-
     # Use langchain token count implementation
     # NB: https://github.com/ollama/ollama/issues/1716#issuecomment-2074265507
     tokens = llm_model.get_num_tokens(text)
     return tokens
+