Skip to content

Commit a184716

Browse files
committed
fix: removed tokenizer
1 parent 58b1133 commit a184716

File tree

2 files changed

+1
-14
lines changed

2 files changed

+1
-14
lines changed

scrapegraphai/utils/tokenizer.py

-8
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from langchain_ollama import ChatOllama
77
from langchain_mistralai import ChatMistralAI
88
from langchain_core.language_models.chat_models import BaseChatModel
9-
from transformers import GPT2TokenizerFast
109

1110
def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int:
1211
"""
@@ -24,13 +23,6 @@ def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int:
2423
from .tokenizers.tokenizer_ollama import num_tokens_ollama
2524
num_tokens_fn = num_tokens_ollama
2625

27-
elif isinstance(llm_model, GPT2TokenizerFast):
28-
def num_tokens_gpt2(text: str, model: BaseChatModel) -> int:
29-
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
30-
tokens = tokenizer.encode(text)
31-
return len(tokens)
32-
num_tokens_fn = num_tokens_gpt2
33-
3426
else:
3527
from .tokenizers.tokenizer_openai import num_tokens_openai
3628
num_tokens_fn = num_tokens_openai

scrapegraphai/utils/tokenizers/tokenizer_ollama.py

+1 -6
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
"""
44
from langchain_core.language_models.chat_models import BaseChatModel
55
from ..logging import get_logger
6-
from transformers import GPT2TokenizerFast
76

87
def num_tokens_ollama(text: str, llm_model:BaseChatModel) -> int:
98
"""
@@ -22,12 +21,8 @@ def num_tokens_ollama(text: str, llm_model:BaseChatModel) -> int:
2221

2322
logger.debug(f"Counting tokens for text of {len(text)} characters")
2423

25-
if isinstance(llm_model, GPT2TokenizerFast):
26-
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
27-
tokens = tokenizer.encode(text)
28-
return len(tokens)
29-
3024
# Use langchain token count implementation
3125
# NB: https://github.com/ollama/ollama/issues/1716#issuecomment-2074265507
3226
tokens = llm_model.get_num_tokens(text)
3327
return tokens
28+

0 commit comments

Comments (0)