@@ -3,13 +3,24 @@
 
 import json
 import os
+import sys
 from itertools import product
 
 from transformers import AutoTokenizer, AutoConfig
 import numpy as np
 
 from scripts.supported_models import SUPPORTED_MODELS
 
+# Handle protobuf compatibility issues by forcing the pure-Python protobuf backend
+# (one of the workarounds suggested by protobuf's descriptor error message)
+if 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION' not in os.environ:
+    os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
+
+# Check if we should run in local mode (safer settings)
+LOCAL_MODE = '--local' in sys.argv
+if LOCAL_MODE:
+    print("Running in local mode with safer settings")
+
 # List of tokenizers where the model isn't yet supported, but the tokenizer is
 ADDITIONAL_TOKENIZERS_TO_TEST = {
     'falcon': [
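For context on the workaround above: `PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION` is only honored if it is set before `google.protobuf` is first imported, which is why the diff sets it at the top of the script, ahead of the `transformers` imports. A minimal standalone sketch of the same idea (the model name here is only an illustrative assumption):

```python
import os

# Must be set before anything imports google.protobuf (transformers'
# sentencepiece converters do), otherwise the backend is already chosen.
os.environ.setdefault('PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION', 'python')

from transformers import AutoTokenizer

# Loading a sentencepiece-based tokenizer is the path that can otherwise
# trigger protobuf's "Descriptors cannot be created directly" error.
tokenizer = AutoTokenizer.from_pretrained('t5-small')  # example model, an assumption
print(tokenizer.is_fast)
```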
@@ -284,9 +295,12 @@ def generate_tokenizer_tests():
                 )
 
             else:
+                # In local mode, always use slow tokenizers to avoid protobuf issues
+                use_fast = not LOCAL_MODE
                 decoder_tokenizer = tokenizer = AutoTokenizer.from_pretrained(
                     tokenizer_name,
-                    trust_remote_code=True)
+                    trust_remote_code=True,
+                    use_fast=use_fast)
 
         except (KeyError, EnvironmentError):
             # If a KeyError/EnvironmentError is raised from the AutoTokenizer, it
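The `use_fast = not LOCAL_MODE` fallback relies on `AutoTokenizer`'s standard `use_fast` switch: slow (pure-Python) tokenizers skip the fast-tokenizer conversion step that pulls in protobuf. A small sketch of the two paths (the model name is an assumption for illustration):

```python
from transformers import AutoTokenizer

# Fast (Rust-backed) tokenizer -- the default when one is available
fast = AutoTokenizer.from_pretrained('bert-base-uncased')
print(type(fast).__name__, fast.is_fast)   # e.g. BertTokenizerFast True

# Slow (pure-Python) tokenizer -- what local mode falls back to
slow = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=False)
print(type(slow).__name__, slow.is_fast)   # e.g. BertTokenizer False
```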
@@ -333,11 +347,13 @@ def generate_tokenizer_tests():
 
     for tokenizer_id in TOKENIZERS_WITH_CHAT_TEMPLATES:
         print(f'Generating chat templates for {tokenizer_id}')
+
+        # In local mode, fall back to slow tokenizers for llama models only
+        use_fast = not LOCAL_MODE or 'llama' not in tokenizer_id
+
         tokenizer = AutoTokenizer.from_pretrained(
             tokenizer_id,
-
-            # TODO: Remove once https://github.com/huggingface/transformers/pull/26678 is fixed
-            use_fast='llama' not in tokenizer_id,
+            use_fast=use_fast,
             trust_remote_code=True,
         )
         tokenizer_results = []
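The boolean in the chat-template branch reads a little densely: fast tokenizers are disabled only for llama models, and only in local mode. Note that this differs from the removed TODO, which forced slow tokenizers for llama unconditionally; non-local runs now use fast tokenizers everywhere. A quick sketch of the expression's behavior (the tokenizer ids are illustrative assumptions):

```python
# Behavior of: use_fast = not LOCAL_MODE or 'llama' not in tokenizer_id
for local_mode, tokenizer_id in [
    (False, 'bert-base-uncased'),          # normal mode, non-llama -> fast
    (False, 'meta-llama/Llama-2-7b-hf'),   # normal mode, llama     -> fast
    (True,  'bert-base-uncased'),          # local mode, non-llama  -> fast
    (True,  'meta-llama/Llama-2-7b-hf'),   # local mode, llama      -> slow
]:
    use_fast = not local_mode or 'llama' not in tokenizer_id
    print(local_mode, tokenizer_id, use_fast)
```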