Skip to content

Commit

Permalink
fix: removed tokenizer
Browse files (browse the repository at this point in the history)
  • Loading branch information
VinciGit00 committed Oct 19, 2024
1 parent 58b1133 commit a184716
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 14 deletions.
8 changes: 0 additions & 8 deletions scrapegraphai/utils/tokenizer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,7 +6,6 @@
from langchain_ollama import ChatOllama
from langchain_mistralai import ChatMistralAI
from langchain_core.language_models.chat_models import BaseChatModel
from transformers import GPT2TokenizerFast

def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int:
"""
Expand All @@ -24,13 +23,6 @@ def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int:
from .tokenizers.tokenizer_ollama import num_tokens_ollama
num_tokens_fn = num_tokens_ollama

elif isinstance(llm_model, GPT2TokenizerFast):
def num_tokens_gpt2(text: str, model: BaseChatModel) -> int:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokens = tokenizer.encode(text)
return len(tokens)
num_tokens_fn = num_tokens_gpt2

else:
from .tokenizers.tokenizer_openai import num_tokens_openai
num_tokens_fn = num_tokens_openai
Expand Down
7 changes: 1 addition & 6 deletions scrapegraphai/utils/tokenizers/tokenizer_ollama.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,7 +3,6 @@
"""
from langchain_core.language_models.chat_models import BaseChatModel
from ..logging import get_logger
from transformers import GPT2TokenizerFast

def num_tokens_ollama(text: str, llm_model:BaseChatModel) -> int:
"""
Expand All @@ -22,12 +21,8 @@ def num_tokens_ollama(text: str, llm_model:BaseChatModel) -> int:

logger.debug(f"Counting tokens for text of {len(text)} characters")

if isinstance(llm_model, GPT2TokenizerFast):
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokens = tokenizer.encode(text)
return len(tokens)

# Use langchain token count implementation
# NB: https://github.com/ollama/ollama/issues/1716#issuecomment-2074265507
tokens = llm_model.get_num_tokens(text)
return tokens

0 comments on commit a184716

Please sign in to comment.