Commit 720c0a9

fix: resolve tokenizer test generation issues in CI and local environments
1 parent 5d46f96 commit 720c0a9

1 file changed: tests/generate_tests.py (+20 -4 lines)
@@ -3,13 +3,24 @@

 import json
 import os
+import sys
 from itertools import product

 from transformers import AutoTokenizer, AutoConfig
 import numpy as np

 from scripts.supported_models import SUPPORTED_MODELS

+# Handle protobuf compatibility issues by setting the environment variable if not already set
+# This is one of the workarounds mentioned in the error message
+if 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION' not in os.environ:
+    os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
+
+# Check if we should run in local mode (safer settings)
+LOCAL_MODE = '--local' in sys.argv
+if LOCAL_MODE:
+    print("Running in local mode with safer settings")
+
 # List of tokenizers where the model isn't yet supported, but the tokenizer is
 ADDITIONAL_TOKENIZERS_TO_TEST = {
     'falcon': [
@@ -284,9 +295,12 @@ def generate_tokenizer_tests():
                 )

             else:
+                # In local mode, always use slow tokenizers to avoid protobuf issues
+                use_fast = not LOCAL_MODE
                 decoder_tokenizer = tokenizer = AutoTokenizer.from_pretrained(
                     tokenizer_name,
-                    trust_remote_code=True)
+                    trust_remote_code=True,
+                    use_fast=use_fast)

         except (KeyError, EnvironmentError):
             # If a KeyError/EnvironmentError is raised from the AutoTokenizer, it
@@ -333,11 +347,13 @@ def generate_tokenizer_tests():

     for tokenizer_id in TOKENIZERS_WITH_CHAT_TEMPLATES:
         print(f'Generating chat templates for {tokenizer_id}')
+
+        # In local mode, use safer settings
+        use_fast = not LOCAL_MODE or 'llama' not in tokenizer_id
+
         tokenizer = AutoTokenizer.from_pretrained(
             tokenizer_id,
-
-            # TODO: Remove once https://github.com/huggingface/transformers/pull/26678 is fixed
-            use_fast='llama' not in tokenizer_id,
+            use_fast=use_fast,
             trust_remote_code=True,
         )
         tokenizer_results = []
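
The two toggles introduced here can be exercised outside the full test-generation run. The sketch below only mirrors what the diff adds (the PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION environment variable, the `--local` flag check, and the `use_fast` switch); the checkpoint name `bert-base-uncased` is a placeholder chosen for illustration and is not part of this commit.

    # Minimal sketch of the new toggles in isolation (illustrative, not part of the commit).
    import os
    import sys

    from transformers import AutoTokenizer

    # Fall back to the pure-Python protobuf implementation unless the caller already chose one.
    os.environ.setdefault('PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION', 'python')

    # "--local" switches to the safer settings used by the test generator.
    LOCAL_MODE = '--local' in sys.argv

    tokenizer = AutoTokenizer.from_pretrained(
        'bert-base-uncased',      # placeholder checkpoint for illustration
        use_fast=not LOCAL_MODE,  # slow tokenizer sidesteps the protobuf issue in local mode
        trust_remote_code=True,
    )
    print(type(tokenizer).__name__)

Presumably the generator itself is invoked as `python tests/generate_tests.py --local` to pick up the safer local-mode settings, and without the flag for CI.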
